/************************************************************************
 *
 *  DeTeXtive.java
 *
 *  Copyright: 2002-2014 by Henrik Just
 *
 *  This file is part of Writer2LaTeX.
 *
 *  Writer2LaTeX is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  Writer2LaTeX is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with Writer2LaTeX.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  Version 1.4 (2014-09-24)
 *
 */

package org.openoffice.da.comp.writer2latex;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.util.HashSet;

import org.openoffice.da.comp.w2lcommon.tex.tokenizer.Mouth;
import org.openoffice.da.comp.w2lcommon.tex.tokenizer.Token;
import org.openoffice.da.comp.w2lcommon.tex.tokenizer.TokenType;

/** This class analyzes a stream and detects if it is a TeX stream.
 *  Currently it is able to identify LaTeX and XeLaTeX (ConTeXt and plain TeX may be
 *  added later).
 *
 *  <p>The parser state (tokenizer, current token, collected packages) is kept in
 *  instance fields that are reset on every call of {@link #deTeXt(InputStream)},
 *  so a single instance must not be used for several detections at the same time.</p>
 */
public class DeTeXtive {
    // The tokenizer reading the stream under inspection
    private Mouth mouth;
    // The current token. It is fetched once from the tokenizer and never reassigned,
    // so it is presumably updated in place by every call to mouth.getToken()
    private Token token;
    // The names of all packages loaded with \\usepackage in the preamble
    private HashSet<String> packages;

    /** Construct a new DeTeXtive
     */
    public DeTeXtive() {
    }

    /** Detect the format of a given stream
     *
     *  <p>The stream is accepted as LaTeX if it starts with a <code>\documentclass</code>
     *  or <code>\documentstyle</code> command with a non-empty class name and a
     *  <code>\begin{document}</code> is found later. It is reported as XeLaTeX if
     *  the package <code>xunicode</code> is loaded in the preamble.</p>
     *
     *  @param is the input stream
     *  @return a string representing the detected format; null if the format is unknown.
     *  Currently the values "LaTeX", "XeLaTeX" are supported.
     *  @throws IOException if we fail to read the stream
     */
    public String deTeXt(InputStream is) throws IOException {
        // It makes no harm to assume that the stream uses ISO Latin1 - we only consider ASCII characters
        mouth = new Mouth(new InputStreamReader(is,"ISO8859_1"));
        token = mouth.getTokenObject();

        packages = new HashSet<String>();

        mouth.getToken();

        if (parseHeader() && parsePreamble()) {
            if (packages.contains("xunicode")) {
                return "XeLaTeX";
            }
            else {
                return "LaTeX";
            }
        }

        // Unknown format
        return null;
    }

    // The parser!

    /** Parse a LaTeX header such as <code>\documentclass[a4paper]{article}</code>
     *
     *  @return true in case of success, that is if the first non-blank token is
     *  <code>\documentclass</code> or <code>\documentstyle</code> and it is followed
     *  (after optional arguments) by a group containing at least one character
     *  @throws IOException if we fail to read the stream
     */
    private boolean parseHeader() throws IOException {
        skipBlanks();
        if (token.isCS("documentclass") || token.isCS("documentstyle")) {
            // The first non-blank token is \documentclass or \documentstyle => could be a LaTeX document
            //System.out.println("** Found "+token.toString());
            mouth.getToken();
            skipSpaces();
            // Skip options, if any
            if (token.is('[',TokenType.OTHER)) {
                skipOptional();
                skipSpaces();
            }
            if (token.getType()==TokenType.BEGIN_GROUP) {
                // Get class name
                String sClassName = parseArgumentAsString();
                //System.out.println("** Found the class name "+sClassName);
                // Accept any class name of one or more characters
                if (sClassName.length()>0) { return true; }
            }
        }
        //System.out.println("** Doesn't look like LaTeX; failed to get class name");
        return false;
    }

    /** Parse a LaTeX preamble, collecting the names of all packages loaded with
     *  <code>\\usepackage</code> on the way
     *
     *  @return true in case of success (that is, <code>\begin{document}</code> was found);
     *  false if the end of the input was reached first
     *  @throws IOException if we fail to read the stream
     */
    private boolean parsePreamble() throws IOException {
        while (token.getType()!=TokenType.ENDINPUT) {
            if (token.isCS("usepackage")) {
                // We collect the names of all used packages, but discard their options
                // (Recall that this is only relevant for LaTeX 2e)
                mouth.getToken();
                skipSpaces();
                if (token.is('[',TokenType.OTHER)) {
                    skipOptional();
                    skipSpaces();
                }
                String sNames = parseArgumentAsString();
                //System.out.println("** Found package(s) "+sNames);
                // The argument may be a comma separated list, e.g. \\usepackage{fontspec,xunicode}
                addPackages(sNames);
            }
            else if (token.getType()==TokenType.BEGIN_GROUP) {
                // We ignore anything inside a group
                skipGroup();
            }
            else if (token.isCS("begin")) {
                // This would usually indicate the end of the preamble
                mouth.getToken();
                skipSpaces();
                if ("document".equals(parseArgumentAsString())) {
                    //System.out.println("Found \\begin{document}");
                    return true;
                }
            }
            else {
                // Any other content in the preamble is simply ignored
                mouth.getToken();
            }
        }
        //System.out.println("** Doesn't look like LaTeX; failed to find \\begin{document}");
        return false;
    }

    /** Register the packages named in the argument of a <code>\\usepackage</code> command
     *
     *  @param sList the argument: a single package name or a comma separated list
     *  of package names. Surrounding white space is removed from each name and
     *  empty entries are ignored.
     */
    private void addPackages(String sList) {
        for (String sName : sList.split(",")) {
            String sTrimmed = sName.trim();
            if (sTrimmed.length()>0) {
                packages.add(sTrimmed);
            }
        }
    }

    /** Skip all space tokens and <code>\par</code> control sequences at the current position
     *
     *  @throws IOException if we fail to read the stream
     */
    private void skipBlanks() throws IOException {
        while (token.getType()==TokenType.SPACE || token.isCS("par")) {
            mouth.getToken();
        }
    }

    /** Skip all space tokens at the current position
     *
     *  @throws IOException if we fail to read the stream
     */
    private void skipSpaces() throws IOException {
        // Actually, we will never get two space tokens in a row
        while (token.getType()==TokenType.SPACE) {
            mouth.getToken();
        }
    }

    /** Skip an optional argument. The current token must be the opening <code>[</code>.
     *  Groups inside the argument are skipped as a whole, hence a <code>]</code>
     *  inside braces does not end the argument.
     *
     *  @throws IOException if we fail to read the stream
     */
    private void skipOptional() throws IOException {
        assert token.is('[', TokenType.OTHER);

        mouth.getToken(); // skip the [
        while (!token.is(']',TokenType.OTHER) && token.getType()!=TokenType.ENDINPUT) {
            if (token.getType()==TokenType.BEGIN_GROUP) {
                skipGroup();
            }
            else {
                mouth.getToken(); // skip this token
            }
        }
        mouth.getToken(); // skip the ]
    }

    /** Skip a group including any nested groups. The current token must be the
     *  token that begins the group.
     *
     *  @throws IOException if we fail to read the stream
     */
    private void skipGroup() throws IOException {
        assert token.getType()==TokenType.BEGIN_GROUP;

        mouth.getToken(); // skip the {
        while (token.getType()!=TokenType.END_GROUP && token.getType()!=TokenType.ENDINPUT) {
            if (token.getType()==TokenType.BEGIN_GROUP) {
                skipGroup();
            }
            else {
                mouth.getToken(); // skip this token
            }
        }
        mouth.getToken(); // skip the }
    }

    /** Read a mandatory argument and return its content as a string.
     *  If the current token begins a group, the characters of all tokens up to the
     *  first end-of-group token (or the end of the input) are concatenated; nested
     *  groups get no special treatment. Otherwise the argument is the current token
     *  alone. In both cases control sequences are left out of the result.
     *
     *  @return the content of the argument, possibly the empty string
     *  @throws IOException if we fail to read the stream
     */
    private String parseArgumentAsString() throws IOException {
        if (token.getType()==TokenType.BEGIN_GROUP) {
            // Argument is contained in a group
            mouth.getToken(); // skip the {
            StringBuilder sb = new StringBuilder();
            while (token.getType()!=TokenType.END_GROUP && token.getType()!=TokenType.ENDINPUT) {
                if (token.getType()!=TokenType.COMMAND_SEQUENCE) {
                    // should not include cs, ignore if it happens
                    sb.append(token.getChar());
                }
                mouth.getToken();
            }
            mouth.getToken(); // skip the }
            return sb.toString();
        }
        else {
            // Argument is a single token
            String s = "";
            if (token.getType()!=TokenType.COMMAND_SEQUENCE) {
                // should not include cs, ignore if it happens
                s = token.getString();
            }
            mouth.getToken();
            return s;
        }
    }

}