/************************************************************************ * * OfficeDocument.java * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA * * Copyright: 2002-2012 by Henrik Just * * All Rights Reserved. * * Version 1.4 (2012-03-27) * */ package writer2latex.office; import java.io.BufferedInputStream; import java.io.InputStream; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.Iterator; import java.util.Map; import java.util.HashMap; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.w3c.dom.Element; import org.w3c.dom.Document; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import writer2latex.util.SimpleSAXHandler; import writer2latex.util.SimpleZipReader; /** * This class implements reading of ODF files from various sources */ public class OfficeDocument { // File names for the XML streams in a package document protected final static String CONTENTXML = "content.xml"; protected final static String STYLESXML = "styles.xml"; protected final static String SETTINGSXML = "settings.xml"; private final static String METAXML = "meta.xml"; private final static String MANIFESTXML = "META-INF/manifest.xml"; // Some tag and attribute names in manifest.xml private final static String MANIFEST_FILE_ENTRY = "manifest:file-entry"; private final static String MANIFEST_MEDIA_TYPE = "manifest:media-type"; private final static String MANIFEST_FULL_PATH = "manifest:full-path"; /** DOM Document of content.xml. */ private Document contentDoc = null; /** DOM Document of meta.xml. */ private Document metaDoc = null; /** DOM Document of settings.xml. */ private Document settingsDoc = null; /** DOM Document of content.xml. */ private Document styleDoc = null; /** DOM Document of META-INF/manifest.xml. */ private Document manifestDoc = null; /** SimpleZipReader to store the contents from the InputStream * if the document is in package format (otherwise this will remain null) */ private SimpleZipReader zip = null; /** Collection to keep track of the embedded objects in the document. */ private Map embeddedObjects = null; /** Package or flat format? * @return true if the document is in package format, false if it's flat XML */ public boolean isPackageFormat() { return zip!=null; } /** * Return a DOM Document object of the content.xml file. * file. Note that a content DOM is not created when the constructor * is called, but only after the read method has been invoked * * @return DOM Document object. */ public Document getContentDOM() { return contentDoc; } /** * Return a DOM Document object of the meta.xml * file. Note that a meta DOM is not created when the constructor * is called, but only after the read method has been invoked * * @return DOM Document object. */ public Document getMetaDOM() { return metaDoc; } /** * Return a DOM Document object of the settings.xml * file. Note that a settings DOM is not created when the constructor * is called, but only after the read method has been invoked * * @return DOM Document object. */ public Document getSettingsDOM() { return settingsDoc; } /** * Return a DOM Document object of the style.xml file. * Note that a style DOM is not created when the constructor * is called, but only after the read method has been invoked * * @return DOM Document object. */ public Document getStyleDOM() { return styleDoc; } /** * Returns all the embedded objects (graphics, formulae, etc.) present in * this document. If the document is read from flat XML there will be no embedded objects. * * @return An Iterator of EmbeddedObject objects. */ public Iterator getEmbeddedObjects() { if (embeddedObjects == null) { embeddedObjects = new HashMap(); if (manifestDoc != null) { // Need to read the manifest file and construct a list of objects NodeList nl = manifestDoc.getElementsByTagName(MANIFEST_FILE_ENTRY); int nLen = nl.getLength(); for (int i = 0; i < nLen; i++) { Element elm = (Element) nl.item(i); String sType = elm.getAttribute(MANIFEST_MEDIA_TYPE); String sPath = elm.getAttribute(MANIFEST_FULL_PATH); /* According to the ODF spec there are only two types of embedded object: * Objects with an XML representation. * Objects without an XML representation. * The former are represented by one or more XML files. * The latter are in binary form. */ if (sType.startsWith("application/vnd.oasis.opendocument") || sType.startsWith("application/vnd.sun.xml")) { // Allow either ODF or old OOo 1.x embedded objects if (!sPath.equals("/")) { // Exclude the main document entries if (sPath.endsWith("/")) { // Remove trailing slash sPath=sPath.substring(0, sPath.length()-1); } embeddedObjects.put(sPath, new EmbeddedXMLObject(sPath, sType, zip)); } } else if (!sType.equals("text/xml")) { // XML entries are either embedded ODF doc entries or main document entries, all other // entries are included as binary objects embeddedObjects.put(sPath, new EmbeddedBinaryObject(sPath, sType, zip)); } } } } return embeddedObjects.values().iterator(); } /** * Returns the embedded object corresponding to the name provided. * The name should be stripped of any preceding path characters, such as * '/', '.' or '#'. * * @param sName The name of the embedded object to retrieve. * * @return An EmbeddedObject instance representing the named * object. */ public EmbeddedObject getEmbeddedObject(String sName) { if (sName == null) { return null; } getEmbeddedObjects(); if (embeddedObjects.containsKey(sName)) { return embeddedObjects.get(sName); } return null; } /** * Read the document from a DOM tree (flat XML format) * * @param dom the DOM tree */ public void read(org.w3c.dom.Document dom) { contentDoc = dom; styleDoc = null; settingsDoc = null; metaDoc = null; manifestDoc = null; zip=null; embeddedObjects = null; } /** * Read the Office Document from the given * InputStream. * Performs simple type detection to determine package or flat format * * @param is Office document InputStream. * * @throws IOException If any I/O error occurs. */ public void read(InputStream is) throws IOException { // We need to read 4 bytes ahead to detect flat or zip format BufferedInputStream inbuf = new BufferedInputStream(is); byte[] bytes = new byte[4]; inbuf.mark(4); inbuf.read(bytes); inbuf.reset(); boolean bZip = MIMETypes.ZIP.equals(MIMETypes.getMagicMIMEType(bytes)); if (bZip) { readZip(inbuf); } else { readFlat(inbuf); } } private void readZip(InputStream is) throws IOException { zip = new SimpleZipReader(); zip.read(is); byte contentBytes[] = zip.getEntry(CONTENTXML); if (contentBytes == null) { throw new IOException("Entry content.xml not found in file"); } try { contentDoc = parse(contentBytes); } catch (SAXException ex) { throw new IOException(ex); } byte styleBytes[] = zip.getEntry(STYLESXML); if (styleBytes != null) { try { styleDoc = parse(styleBytes); } catch (SAXException ex) { throw new IOException(ex); } } byte metaBytes[] = zip.getEntry(METAXML); if (metaBytes != null) { try { metaDoc = parse(metaBytes); } catch (SAXException ex) { throw new IOException(ex); } } byte settingsBytes[] = zip.getEntry(SETTINGSXML); if (settingsBytes != null) { try { settingsDoc = parse(settingsBytes); } catch (SAXException ex) { throw new IOException(ex); } } byte manifestBytes[] = zip.getEntry(MANIFESTXML); if (manifestBytes != null) { try { manifestDoc = parse(manifestBytes); } catch (SAXException ex) { throw new IOException(ex); } } } private void readFlat(InputStream is) throws IOException { SAXParserFactory factory=SAXParserFactory.newInstance(); SimpleSAXHandler handler = new SimpleSAXHandler(); try { SAXParser saxParser = factory.newSAXParser(); saxParser.parse(is,handler); } catch (SAXException e){ System.err.println("Oops - Error parsing document"); e.printStackTrace(); } catch (ParserConfigurationException e) { System.err.println("Oops - failed to get XML parser!?"); e.printStackTrace(); } contentDoc = handler.getDOM(); styleDoc = null; settingsDoc = null; metaDoc = null; manifestDoc = null; zip=null; embeddedObjects = null; } /** * Parse given byte array into a DOM * Document object using the * DocumentBuilder object. * * @param builder DocumentBuilder object for parsing. * @param bytes byte array for parsing. * * @return Resulting DOM Document object. * * @throws SAXException If any parsing error occurs. */ static Document parse(byte bytes[]) throws SAXException, IOException { SAXParserFactory factory=SAXParserFactory.newInstance(); SimpleSAXHandler handler = new SimpleSAXHandler(); try { SAXParser saxParser = factory.newSAXParser(); saxParser.parse(new ByteArrayInputStream(bytes),handler); return handler.getDOM(); } catch (ParserConfigurationException e) { System.err.println("Oops - failed to get XML parser!?"); e.printStackTrace(); } return null; } }