Moved tests

This commit is contained in:
Georgy Litvinov 2020-03-11 12:05:18 +01:00
parent 720195428f
commit 3581998f01
13 changed files with 6 additions and 705 deletions

View file

@ -1,48 +0,0 @@
/************************************************************************
*
* Catcode.java
*
* Copyright: 2002-2009 by Henrik Just
*
* This file is part of Writer2LaTeX.
*
* Writer2LaTeX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Writer2LaTeX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Writer2LaTeX. If not, see <http://www.gnu.org/licenses/>.
*
* Version 1.2 (2009-06-11)
*
*/
package org.openoffice.da.comp.w2lcommon.tex.tokenizer;
/**
 * The sixteen TeX category codes (catcodes) that can be assigned to a
 * character, as defined in chapter 7 of "The TeXbook".
 *
 * <p>The constants are declared in the order of TeX's numeric catcodes,
 * so the ordinal of a constant equals the catcode number used by TeX.</p>
 */
public enum Catcode {
    /** Catcode 0: escape character (starts a control sequence). */
    ESCAPE,
    /** Catcode 1: beginning of a group. */
    BEGIN_GROUP,
    /** Catcode 2: end of a group. */
    END_GROUP,
    /** Catcode 3: math shift. */
    MATH_SHIFT,
    /** Catcode 4: alignment tab. */
    ALIGNMENT_TAB,
    /** Catcode 5: end of line. */
    END_OF_LINE,
    /** Catcode 6: macro parameter. */
    PARAMETER,
    /** Catcode 7: superscript. */
    SUPERSCRIPT,
    /** Catcode 8: subscript. */
    SUBSCRIPT,
    /** Catcode 9: ignored character. */
    IGNORED,
    /** Catcode 10: space. */
    SPACE,
    /** Catcode 11: letter. */
    LETTER,
    /** Catcode 12: other character. */
    OTHER,
    /** Catcode 13: active character. */
    ACTIVE,
    /** Catcode 14: comment character. */
    COMMENT,
    /** Catcode 15: invalid character. */
    INVALID
}

View file

@ -1,94 +0,0 @@
/************************************************************************
*
* CatcodeTable.java
*
* Copyright: 2002-2009 by Henrik Just
*
* This file is part of Writer2LaTeX.
*
* Writer2LaTeX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Writer2LaTeX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Writer2LaTeX. If not, see <http://www.gnu.org/licenses/>.
*
* Version 1.2 (2009-06-11)
*
*/
package org.openoffice.da.comp.w2lcommon.tex.tokenizer;
/**
 * A mutable lookup table from characters to catcodes.
 *
 * <p>Only the 128 ASCII characters are stored; in this implementation every
 * non-ASCII character always has the category <code>Catcode.OTHER</code>.</p>
 */
public class CatcodeTable {
    // One slot per ASCII character, indexed by character code
    private final Catcode[] catcodes;

    /**
     * Construct a new <code>CatcodeTable</code> holding the catcodes defined
     * by INITeX plus the additional catcodes defined by plain TeX.
     */
    public CatcodeTable() {
        catcodes = new Catcode[128];

        // INITeX defaults (Chapter 7 in "The TeXbook"): everything is OTHER ...
        java.util.Arrays.fill(catcodes, Catcode.OTHER);
        // ... except the 26 upper case and 26 lower case letters ...
        for (int offset = 0; offset < 26; offset++) {
            catcodes['A' + offset] = Catcode.LETTER;
            catcodes['a' + offset] = Catcode.LETTER;
        }
        // ... and a handful of special characters
        catcodes['\\'] = Catcode.ESCAPE;
        catcodes['%'] = Catcode.COMMENT;
        catcodes['\r'] = Catcode.END_OF_LINE;
        catcodes[' '] = Catcode.SPACE;
        catcodes['\u0000'] = Catcode.IGNORED; // ASCII NUL
        catcodes['\u007F'] = Catcode.INVALID; // ASCII DEL

        // Additions made by plain TeX (Appendix B in "The TeXbook")
        catcodes['{'] = Catcode.BEGIN_GROUP;
        catcodes['}'] = Catcode.END_GROUP;
        catcodes['$'] = Catcode.MATH_SHIFT;
        catcodes['&'] = Catcode.ALIGNMENT_TAB;
        catcodes['#'] = Catcode.PARAMETER;
        catcodes['^'] = Catcode.SUPERSCRIPT;
        catcodes['\u000B'] = Catcode.SUPERSCRIPT; // ASCII VT ("uparrow")
        catcodes['_'] = Catcode.SUBSCRIPT;
        catcodes['\u0001'] = Catcode.SUBSCRIPT; // ASCII SOH ("downarrow")
        catcodes['\t'] = Catcode.SPACE;
        catcodes['~'] = Catcode.ACTIVE;
        catcodes['\u000C'] = Catcode.ACTIVE; // ASCII FF
    }

    /**
     * Assign a catcode to a character. Requests for characters outside the
     * ASCII character set are silently ignored.
     *
     * @param c the character
     * @param cc the desired catcode
     */
    public void set(char c, Catcode cc) {
        if (c >= 128) {
            return;
        }
        catcodes[c] = cc;
    }

    /**
     * Look up the catcode of a character.
     *
     * @param c the character
     * @return the current catcode; always <code>Catcode.OTHER</code> for
     *         characters outside the ASCII character set
     */
    public Catcode get(char c) {
        return c < 128 ? catcodes[c] : Catcode.OTHER;
    }
}

View file

@ -1,342 +0,0 @@
/************************************************************************
*
* Mouth.java
*
* Copyright: 2002-2010 by Henrik Just
*
* This file is part of Writer2LaTeX.
*
* Writer2LaTeX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Writer2LaTeX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Writer2LaTeX. If not, see <http://www.gnu.org/licenses/>.
*
* Version 1.2 (2010-10-25)
*
*/
package org.openoffice.da.comp.w2lcommon.tex.tokenizer;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
/** The states the tokenizer can be in while it reads a line of input. */
enum State {
    /** At the beginning of a new line. */
    N,
    /** In the middle of a line. */
    M,
    /** Ignoring (skipping) spaces. */
    S
}
/** <p>The Mouth is the main class of this package. It is a tokenizer to TeX files: According to "The TeXBook", the
* "eyes" and "mouth" of TeX are responsible for turning the input to TeX into a sequence of tokens.
* We are not going to reimplement TeX, but rather providing a service for parsing high-level languages based on
* TeX (eg. LaTeX, ConTeXt). For this reason the tokenizer deviates slightly from TeX: We're not reading a stream
* of bytes but rather a stream of characters (which makes no difference for ASCII files).</p>
*
* <p>In tribute to Donald E. Knuths digestive metaphors, we divide the process in four levels</p>
* <ul>
* <li>The parser should provide a <em>pair of glasses</em> to translate the stream of bytes into a stream of characters</li>
* <li>The <em>eyes</em> sees the stream of characters as a sequence of lines</li>
* <li>The <em>mouth</em> chews a bit on the characters to turn them into tokens</li>
* <li>The <em>tongue</em> reports the "taste" of the token to the parser</li>
* </ul>
*/
public class Mouth {
private Reader reader; // The input
private CatcodeTable catcodes; // The current catcode table
private char cEndlinechar; // The current value of \endlinechar
private Token token; // The token object
private State state; // The current state of the tokenizer
private Eyes eyes; // sic!
/** Construct a new <code>Mouth</code> based on a character stream
*
* @param reader the character stream to tokenize
* @throws IOException if we fail to read the character stream
*/
public Mouth(Reader reader) throws IOException {
this.reader = reader;
catcodes = new CatcodeTable();
cEndlinechar = '\r';
token = new Token();
state = State.N;
eyes = new Eyes();
}
private class Eyes {
private BufferedReader br; // The input
private String sLine; // The current line
private int nLen; // The length of the current line
private int nIndex; // The current index in the current line
Eyes() throws IOException {
br = new BufferedReader(reader);
nextLine();
}
/** Start looking at the next line of input
*
* @throws IOException if we fail to read the underlying stream
*/
void nextLine() throws IOException {
sLine = br.readLine();
if (sLine!=null) {
nLen = sLine.length();
nIndex = 0;
// Delete trailing spaces
while (nLen>0 && sLine.charAt(nLen-1)==' ') { nLen--; }
}
else { // end of stream
nLen = 0;
nIndex = 1;
}
}
/** Test whether the eyes are looking at a character
*
* @return true if the current line still has characters to look at
*/
boolean lookingAtChar() {
return nIndex<=nLen;
}
/** Test whether the eyes a looking at a line
*
* @return true if a current line is available
*/
boolean lookingAtLine() {
return sLine!=null;
}
/** Get the character that the eyes currently sees
*
* @return the character or U+FFFF if the eyes are not looking at a character
*/
char peekChar() {
return getChar(false);
}
/** Get the character that the eyes currently sees and start looking at the next character
*
* @return the character or U+FFFF if the eyes are not looking at a character
*/
char getChar() {
return getChar(true);
}
private char getChar(boolean bMove) {
if (nIndex<nLen) {
char c = sLine.charAt(nIndex);
if (catcodes.get(c)==Catcode.SUPERSCRIPT && nIndex+2<nLen && catcodes.get(sLine.charAt(nIndex+1))==Catcode.SUPERSCRIPT) {
// Found ^^ and at least one more character
char c1 = sLine.charAt(nIndex+2);
if (nIndex+3<nLen && isHex(c1)) {
char c2 = sLine.charAt(nIndex+3);
if (isHex(c2)) {
// Found ^^ and a lower case hexadecimal number
if (bMove) { nIndex+=4; }
char[] digits = {c1, c2};
return (char) Integer.parseInt(new String(digits), 16);
}
}
else if (c1<128) {
// Found ^^ and an ASCII character
if (bMove) { nIndex+=3; }
if (c1<64) { return (char)(c1+64); }
else { return (char)(c1-64); }
}
}
// Found an ordinary character!
if (bMove) { nIndex++; }
return c;
}
else if (nIndex==nLen) {
// Add \endlinechar at the end of the line
if (bMove) { nIndex++; }
return cEndlinechar;
}
else {
// No more characters on the current line
return '\uFFFF';
}
}
private boolean isHex(char c) {
return ('0'<=c && c<='9') || ('a'<=c && c<='z');
}
}
/** Get the currently used catcode table
*
* @return the table
*/
public CatcodeTable getCatcodes() {
return catcodes;
}
/** Set the catcode table. The catcode table can be changed at any time during tokenization.
*
* @param catcodes the table
*/
public void setCatcodes(CatcodeTable catcodes) {
this.catcodes = catcodes;
}
/** Return the current value of the \endlinechar (the character added to the end of each input line)
*
* @return the character
*/
public char getEndlinechar() {
return cEndlinechar;
}
/** Set a new \endlinechar (the character added to the end of each input line). The character can be changed at
* any time during tokenization.
*
* @param c the character
*/
public void setEndlinechar(char c) {
cEndlinechar = c;
}
/** Return the object used to store the current token (the "tongue" of TeX).
* The same object is reused for all tokens, so for convenience the parser can keep a reference to the object.
* If on the other hand the parser needs to store a token list, it must explicitly clone all tokens.
*
* @return the token
*/
public Token getTokenObject() {
return token;
}
/** Get the next token
*
* @return the token (for convenience; the same object is returned by {@link Mouth#getTokenObject}).
* @throws IOException if we fail to read the underlying stream
*/
public Token getToken() throws IOException {
while (eyes.lookingAtLine()) {
while (eyes.lookingAtChar()) {
char c = eyes.getChar();
switch (catcodes.get(c)) {
case ESCAPE:
token.setType(TokenType.COMMAND_SEQUENCE);
token.clearChars();
// TODO: The description in the TeXBook is not completely clear (to me anyway),
// (as long as \r and no other character has catcode END_OF_LINE this should be correct)
if (catcodes.get(eyes.peekChar())==Catcode.LETTER) {
state = State.S;
while (eyes.lookingAtChar() && catcodes.get(eyes.peekChar())==Catcode.LETTER) {
token.addChar(eyes.getChar());
}
}
else if (catcodes.get(eyes.peekChar())==Catcode.SPACE) {
state = State.S;
token.setChar(eyes.getChar());
}
else if (catcodes.get(eyes.peekChar())!=Catcode.END_OF_LINE) {
state = State.M;
token.setChar(eyes.getChar());
}
else {
// Empty control sequence
state = State.M;
}
return token;
case BEGIN_GROUP:
state = State.M;
token.set(c, TokenType.BEGIN_GROUP);
return token;
case END_GROUP:
state = State.M;
token.set(c, TokenType.END_GROUP);
return token;
case MATH_SHIFT:
state = State.M;
token.set(c, TokenType.MATH_SHIFT);
return token;
case ALIGNMENT_TAB:
state = State.M;
token.set(c, TokenType.ALIGNMENT_TAB);
return token;
case END_OF_LINE:
// Skip rest of line
while (eyes.lookingAtChar()) { eyes.getChar(); }
switch (state) {
case N:
// This terminates an empty line -> insert a \par
token.setType(TokenType.COMMAND_SEQUENCE);
token.clearChars();
token.addChar('p');
token.addChar('a');
token.addChar('r');
return token;
case M:
// Replace with a space token
token.set(' ', TokenType.SPACE);
return token;
case S:
// ignore the character
}
break;
case PARAMETER:
state = State.M;
token.set(c, TokenType.PARAMETER);
return token;
case SUPERSCRIPT:
state = State.M;
token.set(c, TokenType.SUPERSCRIPT);
return token;
case SUBSCRIPT:
state = State.M;
token.set(c, TokenType.SUBSCRIPT);
return token;
case IGNORED:
// ignore this character
break;
case SPACE:
if (state==State.M) {
state=State.S;
token.set(' ', TokenType.SPACE);
return token;
}
// In state N and S the space character is ignored
break;
case LETTER:
state = State.M;
token.set(c, TokenType.LETTER);
return token;
case OTHER:
state = State.M;
token.set(c, TokenType.OTHER);
return token;
case ACTIVE:
state = State.M;
token.set(c, TokenType.ACTIVE);
return token;
case COMMENT:
// Skip rest of line
while (eyes.lookingAtChar()) { eyes.getChar(); }
break;
case INVALID:
// ignore this character (should issue an error message, but we ignore that)
}
}
eyes.nextLine();
state = State.N;
}
// Nothing more to read
token.setType(TokenType.ENDINPUT);
token.clearChars();
return token;
}
}

View file

@ -1,166 +0,0 @@
/************************************************************************
*
* Token.java
*
* Copyright: 2002-2010 by Henrik Just
*
* This file is part of Writer2LaTeX.
*
* Writer2LaTeX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Writer2LaTeX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Writer2LaTeX. If not, see <http://www.gnu.org/licenses/>.
*
* Version 1.2 (2010-10-25)
*
*/
package org.openoffice.da.comp.w2lcommon.tex.tokenizer;
/** This class represents a token in TeX. A token has a <code>TokenType</code> and a
 *  character content (a single character for character tokens, the name for command sequences).
 */
public class Token implements Cloneable {
    private TokenType type;
    private char[] tokenChars; // Buffer for the character content
    private int nTokenLen; // The number of characters in use in the buffer
    private int nCapacity; // The current size of the buffer

    /** Construct a new <code>Token</code>, initialized as a <code>TokenType.ENDINPUT</code>-token
     */
    public Token() {
        type = TokenType.ENDINPUT;
        tokenChars = new char[25];
        nCapacity = 25;
        nTokenLen = 0;
    }

    /** Set the type of this token to a specific <code>TokenType</code>
     *  (the character content is not changed)
     *
     *  @param type the new <code>TokenType</code>
     */
    protected void setType(TokenType type) {
        this.type = type;
    }

    /** Set the character content of this token to a single character
     *  (the type of the token is not changed)
     *
     *  @param c the character
     */
    protected void setChar(char c) {
        tokenChars[0] = c;
        nTokenLen = 1;
    }

    /** Set this token as a character token with a specific <code>TokenType</code>
     *
     *  @param c the character
     *  @param type the <code>TokenType</code> to use
     */
    protected void set(char c, TokenType type) {
        setType(type);
        setChar(c);
    }

    /** Delete the character content of this token
     */
    protected void clearChars() {
        nTokenLen = 0;
    }

    /** Append a character to the character content of this token
     *
     *  @param c the character to be appended
     */
    protected void addChar(char c) {
        if (nTokenLen == nCapacity) {
            // The buffer is full: grow it by another 25 characters
            char[] temp = tokenChars;
            nCapacity+=25;
            tokenChars = new char[nCapacity];
            System.arraycopy(temp, 0, tokenChars, 0, temp.length);
        }
        tokenChars[nTokenLen++] = c;
    }

    /** Test whether this token is a character token of the given type (that is, a single character
     *  with a token type that is neither <code>COMMAND_SEQUENCE</code> nor <code>ENDINPUT</code>)
     *
     *  @param c the character to test
     *  @param type the <code>TokenType</code> to test
     *  @return true if the test was successful
     */
    public boolean is(char c, TokenType type) {
        return this.type==type && type!=TokenType.COMMAND_SEQUENCE && type!=TokenType.ENDINPUT &&
            nTokenLen==1 && tokenChars[0]==c;
    }

    /** Test whether this token is a <code>COMMAND_SEQUENCE</code> token with a given name
     *
     *  @param sName the name of the command sequence
     *  @return true if the test was successful
     */
    public boolean isCS(String sName) {
        if (type==TokenType.COMMAND_SEQUENCE && sName.length()==nTokenLen) {
            for (int i=0; i<nTokenLen; i++) {
                if (sName.charAt(i)!=tokenChars[i]) { return false; }
            }
            return true;
        }
        return false;
    }

    /** Get the <code>TokenType</code> of this token
     *
     *  @return the type
     */
    public TokenType getType() {
        return type;
    }

    /** Get the first character in this token
     *
     *  @return the character or U+FFFF if no characters exist
     */
    public char getChar() {
        return nTokenLen>0 ? tokenChars[0] : '\uFFFF';
    }

    /** Get the character content of this token as a string
     *
     *  @return the character content
     */
    public String getString() {
        return new String(tokenChars,0,nTokenLen);
    }

    /** Get a printable representation of this token
     *
     *  @return a backslash followed by the name for a command sequence, <code>&lt;EOF&gt;</code>
     *  for the end of input and the first character for all other tokens
     */
    @Override public String toString() {
        switch (type) {
        case COMMAND_SEQUENCE:
            return "\\"+getString();
        case ENDINPUT:
            return "<EOF>";
        default:
            return Character.toString(getChar());
        }
    }

    /** Create an independent copy of this token. The copy has its own character buffer, so later
     *  changes to this token do not affect the copy.
     *
     *  @return the copy
     */
    @Override public Object clone() {
        try {
            // Obtain the copy from super.clone() so that it has the same runtime class as this object
            Token newToken = (Token) super.clone();
            // super.clone() only copies the array reference: give the copy its own buffer
            newToken.tokenChars = this.tokenChars.clone();
            return newToken;
        }
        catch (CloneNotSupportedException e) {
            // Cannot happen: this class implements Cloneable
            throw new AssertionError("Token implements Cloneable", e);
        }
    }
}

View file

@ -1,49 +0,0 @@
/************************************************************************
*
* TokenType.java
*
* Copyright: 2002-2009 by Henrik Just
*
* This file is part of Writer2LaTeX.
*
* Writer2LaTeX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Writer2LaTeX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Writer2LaTeX. If not, see <http://www.gnu.org/licenses/>.
*
* Version 1.2 (2009-06-11)
*
*/
package org.openoffice.da.comp.w2lcommon.tex.tokenizer;
/**
 * The kinds of token the tokenizer can deliver.
 *
 * <p>According to chapter 7 in "The TeXbook", a token is either a character
 * with an associated catcode or a control sequence. An "end of input" token
 * is added as a convenience. Not all catcodes can actually end up in a
 * token, so only the relevant ones are included.</p>
 */
public enum TokenType {
    /** Character with catcode "escape". */
    ESCAPE,
    /** Character with catcode "begin group". */
    BEGIN_GROUP,
    /** Character with catcode "end group". */
    END_GROUP,
    /** Character with catcode "math shift". */
    MATH_SHIFT,
    /** Character with catcode "alignment tab". */
    ALIGNMENT_TAB,
    /** Character with catcode "parameter". */
    PARAMETER,
    /** Character with catcode "superscript". */
    SUPERSCRIPT,
    /** Character with catcode "subscript". */
    SUBSCRIPT,
    /** Character with catcode "space". */
    SPACE,
    /** Character with catcode "letter". */
    LETTER,
    /** Character with catcode "other". */
    OTHER,
    /** Character with catcode "active". */
    ACTIVE,
    /** A control sequence rather than a character. */
    COMMAND_SEQUENCE,
    /** Convenience token marking the end of the input. */
    ENDINPUT
}

View file

@ -94,7 +94,7 @@ public class XhtmlUNOPublisher extends UNOPublisher {
}
}
MessageBox msgBox = new MessageBox(xContext, xFrame);
msgBox.showMessage("Writer2xhtml","Error: Failed to open exported document");
msgBox.showMessage("w2phtml","Error: Failed to open exported document");
}
// Open the file in the default application on this system (if any)

View file

@ -1,3 +1,3 @@
#Wed Mar 11 11:08:58 CET 2020
#Wed Mar 11 12:00:39 CET 2020
releaseVersion=0.5.5
releaseDate=11\:08\:58 11-03-2020
releaseDate=12\:00\:39 11-03-2020

View file

@ -1,4 +1,4 @@
package writer2latex.rdf;
package w2phtml.rdf;
import static org.junit.Assert.*;
@ -6,7 +6,7 @@ import java.util.Vector;
import org.junit.Test;
import writer2latex.xhtml.XhtmlDocument;
import w2phtml.xhtml.XhtmlDocument;
public class DocumentStructureTests {

View file

@ -1,4 +1,4 @@
package writer2latex.rdf;
package w2phtml.rdf;
import static org.junit.Assert.*;