Optimized the loading of ODF files (the old xmerge code is completely rewritten)

git-svn-id: svn://svn.code.sf.net/p/writer2latex/code/trunk@144 f0f2a975-2e09-46c8-9428-3b39399b9f3c
This commit is contained in:
henrikjust 2012-03-27 08:31:31 +00:00
parent e3a808f820
commit ecacd13bce
26 changed files with 1215 additions and 2603 deletions

View file

@ -0,0 +1,60 @@
/************************************************************************
*
* EmbeddedBinaryObject.java
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
* Copyright: 2002-2012 by Henrik Just
*
* All Rights Reserved.
*
* Version 1.4 (2012-03-26)
*
*/
package writer2latex.office;
import writer2latex.util.SimpleZipReader;
/**
 * An embedded object in an ODF package document that is stored as raw bytes
 * (i.e. it has a binary rather than an XML representation).
 */
public class EmbeddedBinaryObject extends EmbeddedObject {

    /** The raw bytes of the object, exactly as returned by the ZIP reader. */
    private final byte[] binaryData;

    /**
     * Create the object by fetching the entry named <code>sName</code> from
     * the given package reader.
     *
     * @param sName  The name of the object; it is also the entry name looked up in <code>source</code>.
     * @param sType  The MIME-type of the object.
     * @param source A <code>SimpleZipReader</code> containing the object
     */
    protected EmbeddedBinaryObject(String sName, String sType, SimpleZipReader source) {
        super(sName, sType);
        this.binaryData = source.getEntry(sName);
    }

    /**
     * Get the binary data for this object.
     *
     * @return The <code>byte</code> array obtained from the package when this
     *         object was constructed (the stored array itself, not a copy).
     */
    public byte[] getBinaryData() {
        return this.binaryData;
    }
}

View file

@ -0,0 +1,61 @@
/************************************************************************
*
* EmbeddedObject.java
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
* Copyright: 2002-2012 by Henrik Just
*
* All Rights Reserved.
*
* Version 1.4 (2012-03-27)
*
*/
package writer2latex.office;
/**
 * Base class for an embedded object within an ODF package document.
 * An instance only carries the object's name and MIME type; both are fixed
 * at construction time.
 */
public abstract class EmbeddedObject {

    /** Name of the object. */
    private final String name;

    /** MIME type of the object. */
    private final String type;

    /**
     * Construct a new embedded object.
     *
     * @param name The name of the object.
     * @param type The MIME-type of the object.
     */
    protected EmbeddedObject(String name, String type) {
        this.name = name;
        this.type = type;
    }

    /**
     * Get the name of the embedded object represented by this instance.
     * The name refers to the manifest.xml file.
     *
     * @return The name of the object, as passed to the constructor.
     */
    public final String getName() {
        return this.name;
    }

    /**
     * Get the MIME type of the embedded object represented by this instance.
     * The MIME type refers to the manifest.xml file.
     *
     * @return The MIME type of the object, as passed to the constructor.
     */
    public final String getType() {
        return this.type;
    }
}

View file

@ -0,0 +1,122 @@
/************************************************************************
*
* EmbeddedXMLObject.java
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
* Copyright: 2002-2012 by Henrik Just
*
* All Rights Reserved.
*
* Version 1.4 (2012-03-27)
*
*/
package writer2latex.office;
import java.io.IOException;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import writer2latex.util.SimpleZipReader;
/**
 * This class represents those embedded objects in an ODF document that have an XML representation:
 * Formulas, charts, spreadsheets, text, drawings and presentations.
 * These object types are stored using a combination of content, settings and styles XML files.
 * The raw bytes of each file are fetched when the object is constructed; each one is parsed
 * into a DOM tree the first time the corresponding getter is called.
 */
public class EmbeddedXMLObject extends EmbeddedObject {

    // Unparsed XML streams; each is released (set to null) once it has been parsed
    private byte[] rawContent;
    private byte[] rawSettings;
    private byte[] rawStyles;

    // Parsed XML streams; null until the corresponding getter has been called
    protected Document contentDOM = null;
    protected Document settingsDOM = null;
    protected Document stylesDOM = null;

    /**
     * Read an object from an ODF package document.
     * The entries <code>sName/content.xml</code>, <code>sName/settings.xml</code> and
     * <code>sName/styles.xml</code> are fetched from the package, but not parsed yet.
     *
     * @param sName The name of the object.
     * @param sType The MIME-type of the object.
     * @param source A ZIP reader providing the contents of the package
     */
    protected EmbeddedXMLObject(String sName, String sType, SimpleZipReader source) {
        super(sName, sType);
        final String sFolder = sName + "/";
        rawContent = source.getEntry(sFolder + OfficeDocument.CONTENTXML);
        rawSettings = source.getEntry(sFolder + OfficeDocument.SETTINGSXML);
        rawStyles = source.getEntry(sFolder + OfficeDocument.STYLESXML);
    }

    /**
     * Returns the content data for this embedded object, parsing it on first use.
     *
     * @return DOM representation of "content.xml", or null if no bytes were available for it
     *
     * @throws SAXException If any parser error occurs
     * @throws IOException If any IO error occurs
     */
    public Document getContentDOM() throws SAXException, IOException {
        if (contentDOM != null) {
            return contentDOM;
        }
        contentDOM = getDOM(rawContent);
        // Only reached when parsing did not throw: the bytes are no longer needed
        rawContent = null;
        return contentDOM;
    }

    /**
     * Returns the settings data for this embedded object, parsing it on first use.
     *
     * @return DOM representation of "settings.xml", or null if no bytes were available for it
     *
     * @throws SAXException If any parser error occurs
     * @throws IOException If any IO error occurs
     */
    public Document getSettingsDOM() throws SAXException, IOException {
        if (settingsDOM != null) {
            return settingsDOM;
        }
        settingsDOM = getDOM(rawSettings);
        rawSettings = null;
        return settingsDOM;
    }

    /**
     * Returns the style data for this embedded object, parsing it on first use.
     *
     * @return DOM representation of "styles.xml", or null if no bytes were available for it
     *
     * @throws SAXException If any parser error occurs
     * @throws IOException If any IO error occurs
     */
    public Document getStylesDOM() throws SAXException, IOException {
        if (stylesDOM != null) {
            return stylesDOM;
        }
        stylesDOM = getDOM(rawStyles);
        rawStyles = null;
        return stylesDOM;
    }

    /** Parse the given bytes with <code>OfficeDocument.parse</code>; a null array yields null. */
    private Document getDOM(byte[] data) throws SAXException, IOException {
        return data == null ? null : OfficeDocument.parse(data);
    }
}

View file

@ -40,9 +40,6 @@ import writer2latex.api.GraphicConverter;
import writer2latex.util.Base64;
import writer2latex.util.Misc;
import writer2latex.xmerge.BinaryGraphicsDocument;
import writer2latex.xmerge.EmbeddedObject;
import writer2latex.xmerge.EmbeddedBinaryObject;
import writer2latex.xmerge.OfficeDocument;
//import writer2latex.util.*;

View file

@ -36,7 +36,6 @@ import org.w3c.dom.NodeList;
import writer2latex.util.*;
//import writer2latex.office.*;
import writer2latex.xmerge.OfficeDocument;
/**
* <p>This class represents the metadata of an OOo Writer document.</p>

View file

@ -0,0 +1,349 @@
/************************************************************************
*
* OfficeDocument.java
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
* Copyright: 2002-2012 by Henrik Just
*
* All Rights Reserved.
*
* Version 1.4 (2012-03-27)
*
*/
package writer2latex.office;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import writer2latex.util.SimpleSAXHandler;
import writer2latex.util.SimpleZipReader;
/**
 * This class implements reading of ODF files from various sources:
 * an already parsed DOM tree (flat XML), or an <code>InputStream</code> containing
 * either a package (ZIP) document or a flat XML document.
 * Every <code>read</code> call discards whatever was loaded by a previous call.
 */
public class OfficeDocument {
    // File names for the XML streams in a package document
    protected final static String CONTENTXML = "content.xml";
    protected final static String STYLESXML = "styles.xml";
    protected final static String SETTINGSXML = "settings.xml";
    private final static String METAXML = "meta.xml";
    private final static String MANIFESTXML = "META-INF/manifest.xml";

    // Some tag and attribute names in manifest.xml
    private final static String MANIFEST_FILE_ENTRY = "manifest:file-entry";
    private final static String MANIFEST_MEDIA_TYPE = "manifest:media-type";
    private final static String MANIFEST_FULL_PATH = "manifest:full-path";

    // Number of leading bytes inspected to tell package format from flat XML
    private final static int MAGIC_LENGTH = 4;

    /** DOM <code>Document</code> of content.xml. */
    private Document contentDoc = null;

    /** DOM <code>Document</code> of meta.xml. */
    private Document metaDoc = null;

    /** DOM <code>Document</code> of settings.xml. */
    private Document settingsDoc = null;

    /** DOM <code>Document</code> of styles.xml. */
    private Document styleDoc = null;

    /** DOM <code>Document</code> of META-INF/manifest.xml. */
    private Document manifestDoc = null;

    /** <code>SimpleZipReader</code> to store the contents from the <code>InputStream</code>
     *  if the document is in package format (otherwise this will remain null)
     */
    private SimpleZipReader zip = null;

    /** Collection to keep track of the embedded objects in the document
     *  (created on demand, null until first requested).
     */
    private Map<String, EmbeddedObject> embeddedObjects = null;

    /** Package or flat format?
     *
     *  @return true if the document is in package format, false if it's flat XML
     */
    public boolean isPackageFormat() {
        return zip != null;
    }

    /**
     * Return a DOM <code>Document</code> object of the content.xml file.
     * Note that a content DOM is not created when the constructor
     * is called, but only after the <code>read</code> method has been invoked
     *
     * @return DOM <code>Document</code> object (null if nothing has been read).
     */
    public Document getContentDOM() {
        return contentDoc;
    }

    /**
     * Return a DOM <code>Document</code> object of the meta.xml
     * file. Note that a meta DOM is not created when the constructor
     * is called, but only after the <code>read</code> method has been invoked
     *
     * @return DOM <code>Document</code> object (null for flat XML, or if the
     *         package has no meta.xml entry).
     */
    public Document getMetaDOM() {
        return metaDoc;
    }

    /**
     * Return a DOM <code>Document</code> object of the settings.xml
     * file. Note that a settings DOM is not created when the constructor
     * is called, but only after the <code>read</code> method has been invoked
     *
     * @return DOM <code>Document</code> object (null for flat XML, or if the
     *         package has no settings.xml entry).
     */
    public Document getSettingsDOM() {
        return settingsDoc;
    }

    /**
     * Return a DOM <code>Document</code> object of the styles.xml file.
     * Note that a style DOM is not created when the constructor
     * is called, but only after the <code>read</code> method has been invoked
     *
     * @return DOM <code>Document</code> object (null for flat XML, or if the
     *         package has no styles.xml entry).
     */
    public Document getStyleDOM() {
        return styleDoc;
    }

    /**
     * Returns all the embedded objects (graphics, formulae, etc.) present in
     * this document. If the document is read from flat XML there will be no embedded objects.
     * The objects are created from the manifest on the first call after a
     * <code>read</code> and are cached until the next <code>read</code>.
     *
     * @return An <code>Iterator</code> of <code>EmbeddedObject</code> objects.
     */
    public Iterator<EmbeddedObject> getEmbeddedObjects() {
        if (embeddedObjects == null) {
            embeddedObjects = collectEmbeddedObjects();
        }
        return embeddedObjects.values().iterator();
    }

    /** Build the map from manifest path to embedded object (empty if there is no manifest). */
    private Map<String, EmbeddedObject> collectEmbeddedObjects() {
        Map<String, EmbeddedObject> objects = new HashMap<String, EmbeddedObject>();
        if (manifestDoc == null) {
            return objects;
        }
        NodeList nl = manifestDoc.getElementsByTagName(MANIFEST_FILE_ENTRY);
        int nLen = nl.getLength();
        for (int i = 0; i < nLen; i++) {
            Element elm = (Element) nl.item(i);
            String sType = elm.getAttribute(MANIFEST_MEDIA_TYPE);
            String sPath = elm.getAttribute(MANIFEST_FULL_PATH);
            /* According to the ODF spec there are only two types of embedded object:
             *   Objects with an XML representation.
             *   Objects without an XML representation.
             * The former are represented by one or more XML files.
             * The latter are in binary form.
             */
            if (sType.startsWith("application/vnd.oasis.opendocument") || sType.startsWith("application/vnd.sun.xml")) {
                // Allow either ODF or old OOo 1.x embedded objects
                if (!sPath.equals("/")) { // Exclude the main document entries
                    if (sPath.endsWith("/")) { // Remove trailing slash
                        sPath = sPath.substring(0, sPath.length() - 1);
                    }
                    objects.put(sPath, new EmbeddedXMLObject(sPath, sType, zip));
                }
            }
            else if (!sType.equals("text/xml")) {
                // XML entries are either embedded ODF doc entries or main document entries, all other
                // entries are included as binary objects
                objects.put(sPath, new EmbeddedBinaryObject(sPath, sType, zip));
            }
        }
        return objects;
    }

    /**
     * Returns the embedded object corresponding to the name provided.
     * The name should be stripped of any preceding path characters, such as
     * '/', '.' or '#'.
     *
     * @param sName The name of the embedded object to retrieve.
     *
     * @return An <code>EmbeddedObject</code> instance representing the named
     *         object, or null if <code>sName</code> is null or no such object exists.
     */
    public EmbeddedObject getEmbeddedObject(String sName) {
        if (sName == null) {
            return null;
        }
        getEmbeddedObjects(); // make sure the map has been built
        return embeddedObjects.get(sName);
    }

    /**
     * Read the document from a DOM tree (flat XML format)
     *
     * @param dom the DOM tree
     */
    public void read(org.w3c.dom.Document dom) {
        clear();
        contentDoc = dom;
    }

    /**
     * Read the Office <code>Document</code> from the given
     * <code>InputStream</code>.
     * Performs simple type detection to determine package or flat format
     *
     * @param is Office document <code>InputStream</code>.
     *
     * @throws IOException If any I/O error occurs, or if the XML cannot be parsed.
     */
    public void read(InputStream is) throws IOException {
        // We need to read 4 bytes ahead to detect flat or zip format
        BufferedInputStream inbuf = new BufferedInputStream(is);
        byte[] bytes = new byte[MAGIC_LENGTH];
        inbuf.mark(MAGIC_LENGTH);
        // A single read call may deliver fewer bytes than requested, so keep
        // reading until the buffer is full or the stream ends
        int nFilled = 0;
        while (nFilled < MAGIC_LENGTH) {
            int nRead = inbuf.read(bytes, nFilled, MAGIC_LENGTH - nFilled);
            if (nRead <= 0) {
                break;
            }
            nFilled += nRead;
        }
        inbuf.reset();

        boolean bZip = MIMETypes.ZIP.equals(MIMETypes.getMagicMIMEType(bytes));
        if (bZip) {
            readZip(inbuf);
        }
        else {
            readFlat(inbuf);
        }
    }

    /** Forget everything loaded by a previous <code>read</code>. */
    private void clear() {
        contentDoc = null;
        styleDoc = null;
        settingsDoc = null;
        metaDoc = null;
        manifestDoc = null;
        zip = null;
        embeddedObjects = null;
    }

    /** Read a package format document: content.xml is mandatory, the other streams are optional. */
    private void readZip(InputStream is) throws IOException {
        // Start from a clean state so nothing from an earlier document survives
        // (in particular the cached embedded objects and optional streams)
        clear();
        zip = new SimpleZipReader();
        zip.read(is);

        byte[] contentBytes = zip.getEntry(CONTENTXML);
        if (contentBytes == null) {
            throw new IOException("Entry content.xml not found in file");
        }
        contentDoc = parseIfPresent(contentBytes);
        styleDoc = parseIfPresent(zip.getEntry(STYLESXML));
        metaDoc = parseIfPresent(zip.getEntry(METAXML));
        settingsDoc = parseIfPresent(zip.getEntry(SETTINGSXML));
        manifestDoc = parseIfPresent(zip.getEntry(MANIFESTXML));
    }

    /** Parse the bytes of a package entry, reporting parse errors as <code>IOException</code>; null yields null. */
    private static Document parseIfPresent(byte[] bytes) throws IOException {
        if (bytes == null) {
            return null;
        }
        try {
            return parse(bytes);
        } catch (SAXException ex) {
            throw new IOException(ex);
        }
    }

    /** Read a flat XML document; parse failures are reported as <code>IOException</code>, as for packages. */
    private void readFlat(InputStream is) throws IOException {
        clear();
        SAXParserFactory factory = SAXParserFactory.newInstance();
        SimpleSAXHandler handler = new SimpleSAXHandler();
        try {
            SAXParser saxParser = factory.newSAXParser();
            saxParser.parse(is, handler);
        }
        catch (SAXException e) {
            throw new IOException(e);
        }
        catch (ParserConfigurationException e) {
            throw new IOException(e);
        }
        contentDoc = handler.getDOM();
    }

    /**
     * Parse given <code>byte</code> array into a DOM
     * <code>Document</code> object using a SAX parser feeding a
     * <code>SimpleSAXHandler</code>.
     *
     * @param bytes <code>byte</code> array for parsing.
     *
     * @return Resulting DOM <code>Document</code> object.
     *
     * @throws SAXException If any parsing error occurs.
     * @throws IOException If any I/O error occurs, or if no XML parser can be created.
     */
    static Document parse(byte bytes[]) throws SAXException, IOException {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        SimpleSAXHandler handler = new SimpleSAXHandler();
        try {
            SAXParser saxParser = factory.newSAXParser();
            saxParser.parse(new ByteArrayInputStream(bytes), handler);
            return handler.getDOM();
        }
        catch (ParserConfigurationException e) {
            // Report the failure to the caller instead of silently returning null
            throw new IOException("Failed to get XML parser", e);
        }
    }
}

View file

@ -37,7 +37,6 @@ import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Document;
import writer2latex.xmerge.OfficeDocument;
import writer2latex.util.Misc;
/** <p> This class reads and collects global information about an OOo document.