w2phtml/source/java/writer2latex/office/OfficeDocument.java
henrikjust ecacd13bce Optimized the loading of ODF files (the old xmerge code is completely rewritten)
git-svn-id: svn://svn.code.sf.net/p/writer2latex/code/trunk@144 f0f2a975-2e09-46c8-9428-3b39399b9f3c
2012-03-27 08:31:31 +00:00

349 lines
10 KiB
Java

/************************************************************************
*
* OfficeDocument.java
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*
* Copyright: 2002-2012 by Henrik Just
*
* All Rights Reserved.
*
* Version 1.4 (2012-03-27)
*
*/
package writer2latex.office;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import writer2latex.util.SimpleSAXHandler;
import writer2latex.util.SimpleZipReader;
/**
* This class implements reading of ODF files from various sources
*/
public class OfficeDocument {
// File names for the XML streams in a package document
protected final static String CONTENTXML = "content.xml";
protected final static String STYLESXML = "styles.xml";
protected final static String SETTINGSXML = "settings.xml";
private final static String METAXML = "meta.xml";
private final static String MANIFESTXML = "META-INF/manifest.xml";
// Some tag and attribute names in manifest.xml
private final static String MANIFEST_FILE_ENTRY = "manifest:file-entry";
private final static String MANIFEST_MEDIA_TYPE = "manifest:media-type";
private final static String MANIFEST_FULL_PATH = "manifest:full-path";
/** DOM <code>Document</code> of content.xml. */
private Document contentDoc = null;
/** DOM <code>Document</code> of meta.xml. */
private Document metaDoc = null;
/** DOM <code>Document</code> of settings.xml. */
private Document settingsDoc = null;
/** DOM <code>Document</code> of content.xml. */
private Document styleDoc = null;
/** DOM <code>Document</code> of META-INF/manifest.xml. */
private Document manifestDoc = null;
/** <code>SimpleZipReader</code> to store the contents from the <code>InputStream</code>
* if the document is in package format (otherwise this will remain null)
*/
private SimpleZipReader zip = null;
/** Collection to keep track of the embedded objects in the document. */
private Map<String, EmbeddedObject> embeddedObjects = null;
/** Package or flat format?
* @return true if the document is in package format, false if it's flat XML
*/
public boolean isPackageFormat() {
return zip!=null;
}
/**
* Return a DOM <code>Document</code> object of the content.xml file.
* file. Note that a content DOM is not created when the constructor
* is called, but only after the <code>read</code> method has been invoked
*
* @return DOM <code>Document</code> object.
*/
public Document getContentDOM() {
return contentDoc;
}
/**
* Return a DOM <code>Document</code> object of the meta.xml
* file. Note that a meta DOM is not created when the constructor
* is called, but only after the <code>read</code> method has been invoked
*
* @return DOM <code>Document</code> object.
*/
public Document getMetaDOM() {
return metaDoc;
}
/**
* Return a DOM <code>Document</code> object of the settings.xml
* file. Note that a settings DOM is not created when the constructor
* is called, but only after the <code>read</code> method has been invoked
*
* @return DOM <code>Document</code> object.
*/
public Document getSettingsDOM() {
return settingsDoc;
}
/**
* Return a DOM <code>Document</code> object of the style.xml file.
* Note that a style DOM is not created when the constructor
* is called, but only after the <code>read</code> method has been invoked
*
* @return DOM <code>Document</code> object.
*/
public Document getStyleDOM() {
return styleDoc;
}
/**
* Returns all the embedded objects (graphics, formulae, etc.) present in
* this document. If the document is read from flat XML there will be no embedded objects.
*
* @return An <code>Iterator</code> of <code>EmbeddedObject</code> objects.
*/
public Iterator<EmbeddedObject> getEmbeddedObjects() {
if (embeddedObjects == null) {
embeddedObjects = new HashMap<String, EmbeddedObject>();
if (manifestDoc != null) {
// Need to read the manifest file and construct a list of objects
NodeList nl = manifestDoc.getElementsByTagName(MANIFEST_FILE_ENTRY);
int nLen = nl.getLength();
for (int i = 0; i < nLen; i++) {
Element elm = (Element) nl.item(i);
String sType = elm.getAttribute(MANIFEST_MEDIA_TYPE);
String sPath = elm.getAttribute(MANIFEST_FULL_PATH);
/* According to the ODF spec there are only two types of embedded object:
* Objects with an XML representation.
* Objects without an XML representation.
* The former are represented by one or more XML files.
* The latter are in binary form.
*/
if (sType.startsWith("application/vnd.oasis.opendocument") || sType.startsWith("application/vnd.sun.xml")) {
// Allow either ODF or old OOo 1.x embedded objects
if (!sPath.equals("/")) { // Exclude the main document entries
if (sPath.endsWith("/")) { // Remove trailing slash
sPath=sPath.substring(0, sPath.length()-1);
}
embeddedObjects.put(sPath, new EmbeddedXMLObject(sPath, sType, zip));
}
}
else if (!sType.equals("text/xml")) {
// XML entries are either embedded ODF doc entries or main document entries, all other
// entries are included as binary objects
embeddedObjects.put(sPath, new EmbeddedBinaryObject(sPath, sType, zip));
}
}
}
}
return embeddedObjects.values().iterator();
}
/**
* Returns the embedded object corresponding to the name provided.
* The name should be stripped of any preceding path characters, such as
* '/', '.' or '#'.
*
* @param sName The name of the embedded object to retrieve.
*
* @return An <code>EmbeddedObject</code> instance representing the named
* object.
*/
public EmbeddedObject getEmbeddedObject(String sName) {
if (sName == null) {
return null;
}
getEmbeddedObjects();
if (embeddedObjects.containsKey(sName)) {
return embeddedObjects.get(sName);
}
return null;
}
/**
* Read the document from a DOM tree (flat XML format)
*
* @param dom the DOM tree
*/
public void read(org.w3c.dom.Document dom) {
contentDoc = dom;
styleDoc = null;
settingsDoc = null;
metaDoc = null;
manifestDoc = null;
zip=null;
embeddedObjects = null;
}
/**
* Read the Office <code>Document</code> from the given
* <code>InputStream</code>.
* Performs simple type detection to determine package or flat format
*
* @param is Office document <code>InputStream</code>.
*
* @throws IOException If any I/O error occurs.
*/
public void read(InputStream is) throws IOException {
// We need to read 4 bytes ahead to detect flat or zip format
BufferedInputStream inbuf = new BufferedInputStream(is);
byte[] bytes = new byte[4];
inbuf.mark(4);
inbuf.read(bytes);
inbuf.reset();
boolean bZip = MIMETypes.ZIP.equals(MIMETypes.getMagicMIMEType(bytes));
if (bZip) {
readZip(inbuf);
}
else {
readFlat(inbuf);
}
}
private void readZip(InputStream is) throws IOException {
zip = new SimpleZipReader();
zip.read(is);
byte contentBytes[] = zip.getEntry(CONTENTXML);
if (contentBytes == null) {
throw new IOException("Entry content.xml not found in file");
}
try {
contentDoc = parse(contentBytes);
} catch (SAXException ex) {
throw new IOException(ex);
}
byte styleBytes[] = zip.getEntry(STYLESXML);
if (styleBytes != null) {
try {
styleDoc = parse(styleBytes);
} catch (SAXException ex) {
throw new IOException(ex);
}
}
byte metaBytes[] = zip.getEntry(METAXML);
if (metaBytes != null) {
try {
metaDoc = parse(metaBytes);
} catch (SAXException ex) {
throw new IOException(ex);
}
}
byte settingsBytes[] = zip.getEntry(SETTINGSXML);
if (settingsBytes != null) {
try {
settingsDoc = parse(settingsBytes);
} catch (SAXException ex) {
throw new IOException(ex);
}
}
byte manifestBytes[] = zip.getEntry(MANIFESTXML);
if (manifestBytes != null) {
try {
manifestDoc = parse(manifestBytes);
} catch (SAXException ex) {
throw new IOException(ex);
}
}
}
private void readFlat(InputStream is) throws IOException {
SAXParserFactory factory=SAXParserFactory.newInstance();
SimpleSAXHandler handler = new SimpleSAXHandler();
try {
SAXParser saxParser = factory.newSAXParser();
saxParser.parse(is,handler);
}
catch (SAXException e){
System.err.println("Oops - Error parsing document");
e.printStackTrace();
}
catch (ParserConfigurationException e) {
System.err.println("Oops - failed to get XML parser!?");
e.printStackTrace();
}
contentDoc = handler.getDOM();
styleDoc = null;
settingsDoc = null;
metaDoc = null;
manifestDoc = null;
zip=null;
embeddedObjects = null;
}
/**
* Parse given <code>byte</code> array into a DOM
* <code>Document</code> object using the
* <code>DocumentBuilder</code> object.
*
* @param builder <code>DocumentBuilder</code> object for parsing.
* @param bytes <code>byte</code> array for parsing.
*
* @return Resulting DOM <code>Document</code> object.
*
* @throws SAXException If any parsing error occurs.
*/
static Document parse(byte bytes[]) throws SAXException, IOException {
SAXParserFactory factory=SAXParserFactory.newInstance();
SimpleSAXHandler handler = new SimpleSAXHandler();
try {
SAXParser saxParser = factory.newSAXParser();
saxParser.parse(new ByteArrayInputStream(bytes),handler);
return handler.getDOM();
}
catch (ParserConfigurationException e) {
System.err.println("Oops - failed to get XML parser!?");
e.printStackTrace();
}
return null;
}
}