2012-03-27 08:31:31 +00:00
|
|
|
/************************************************************************
|
|
|
|
*
|
|
|
|
* OfficeDocument.java
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License version 2.1, as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with this library; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
|
|
|
|
* MA 02111-1307 USA
|
|
|
|
*
|
2014-08-25 06:25:05 +00:00
|
|
|
* Copyright: 2002-2014 by Henrik Just
|
2012-03-27 08:31:31 +00:00
|
|
|
*
|
|
|
|
* All Rights Reserved.
|
|
|
|
*
|
2014-09-03 07:04:31 +00:00
|
|
|
* Version 1.4 (2014-08-28)
|
2012-03-27 08:31:31 +00:00
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
package writer2latex.office;
|
|
|
|
|
|
|
|
import java.io.BufferedInputStream;
|
|
|
|
import java.io.InputStream;
|
|
|
|
import java.io.ByteArrayInputStream;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.HashMap;
|
|
|
|
|
|
|
|
import javax.xml.parsers.ParserConfigurationException;
|
|
|
|
import javax.xml.parsers.SAXParser;
|
|
|
|
import javax.xml.parsers.SAXParserFactory;
|
|
|
|
|
|
|
|
import org.w3c.dom.Element;
|
|
|
|
import org.w3c.dom.Document;
|
|
|
|
import org.w3c.dom.NodeList;
|
|
|
|
import org.xml.sax.SAXException;
|
|
|
|
|
2012-04-03 07:50:53 +00:00
|
|
|
import writer2latex.util.SimpleXMLParser;
|
2012-03-27 08:31:31 +00:00
|
|
|
import writer2latex.util.SimpleZipReader;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* This class implements reading of ODF files from various sources
|
|
|
|
*/
|
|
|
|
public class OfficeDocument {
|
2014-08-25 06:25:05 +00:00
|
|
|
// File names for the XML streams in a package document (settings.xml is ignored)
|
2012-03-27 08:31:31 +00:00
|
|
|
protected final static String CONTENTXML = "content.xml";
|
|
|
|
protected final static String STYLESXML = "styles.xml";
|
|
|
|
private final static String METAXML = "meta.xml";
|
|
|
|
private final static String MANIFESTXML = "META-INF/manifest.xml";
|
|
|
|
|
|
|
|
// Some tag and attribute names in manifest.xml
|
|
|
|
private final static String MANIFEST_FILE_ENTRY = "manifest:file-entry";
|
|
|
|
private final static String MANIFEST_MEDIA_TYPE = "manifest:media-type";
|
|
|
|
private final static String MANIFEST_FULL_PATH = "manifest:full-path";
|
2014-08-25 06:25:05 +00:00
|
|
|
|
|
|
|
// Identify package format
|
|
|
|
private boolean bIsPackageFormat = false;
|
2012-03-27 08:31:31 +00:00
|
|
|
|
|
|
|
/** DOM <code>Document</code> of content.xml. */
|
|
|
|
private Document contentDoc = null;
|
|
|
|
|
|
|
|
/** DOM <code>Document</code> of meta.xml. */
|
|
|
|
private Document metaDoc = null;
|
|
|
|
|
|
|
|
/** DOM <code>Document</code> of content.xml. */
|
|
|
|
private Document styleDoc = null;
|
|
|
|
|
|
|
|
/** DOM <code>Document</code> of META-INF/manifest.xml. */
|
|
|
|
private Document manifestDoc = null;
|
|
|
|
|
|
|
|
/** Collection to keep track of the embedded objects in the document. */
|
|
|
|
private Map<String, EmbeddedObject> embeddedObjects = null;
|
2014-09-03 07:04:31 +00:00
|
|
|
|
2012-03-27 08:31:31 +00:00
|
|
|
/** Package or flat format?
|
|
|
|
* @return true if the document is in package format, false if it's flat XML
|
|
|
|
*/
|
|
|
|
public boolean isPackageFormat() {
|
2014-08-25 06:25:05 +00:00
|
|
|
return bIsPackageFormat;
|
2012-03-27 08:31:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return a DOM <code>Document</code> object of the content.xml file.
|
|
|
|
* file. Note that a content DOM is not created when the constructor
|
|
|
|
* is called, but only after the <code>read</code> method has been invoked
|
|
|
|
*
|
|
|
|
* @return DOM <code>Document</code> object.
|
|
|
|
*/
|
|
|
|
public Document getContentDOM() {
|
|
|
|
return contentDoc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return a DOM <code>Document</code> object of the meta.xml
|
|
|
|
* file. Note that a meta DOM is not created when the constructor
|
|
|
|
* is called, but only after the <code>read</code> method has been invoked
|
|
|
|
*
|
|
|
|
* @return DOM <code>Document</code> object.
|
|
|
|
*/
|
|
|
|
public Document getMetaDOM() {
|
|
|
|
return metaDoc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return a DOM <code>Document</code> object of the style.xml file.
|
|
|
|
* Note that a style DOM is not created when the constructor
|
|
|
|
* is called, but only after the <code>read</code> method has been invoked
|
|
|
|
*
|
|
|
|
* @return DOM <code>Document</code> object.
|
|
|
|
*/
|
|
|
|
public Document getStyleDOM() {
|
|
|
|
return styleDoc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2014-08-25 06:25:05 +00:00
|
|
|
* Collect all the embedded objects (graphics, formulae, etc.) present in
|
2012-03-27 08:31:31 +00:00
|
|
|
* this document. If the document is read from flat XML there will be no embedded objects.
|
|
|
|
*/
|
2014-08-25 06:25:05 +00:00
|
|
|
private void getEmbeddedObjects(SimpleZipReader zip) {
|
|
|
|
embeddedObjects = new HashMap<String, EmbeddedObject>();
|
|
|
|
if (manifestDoc != null) {
|
|
|
|
// Need to read the manifest file and construct a list of objects
|
|
|
|
NodeList nl = manifestDoc.getElementsByTagName(MANIFEST_FILE_ENTRY);
|
|
|
|
int nLen = nl.getLength();
|
|
|
|
for (int i = 0; i < nLen; i++) {
|
|
|
|
Element elm = (Element) nl.item(i);
|
|
|
|
String sType = elm.getAttribute(MANIFEST_MEDIA_TYPE);
|
|
|
|
String sPath = elm.getAttribute(MANIFEST_FULL_PATH);
|
|
|
|
|
|
|
|
/* According to the ODF spec there are only two types of embedded object:
|
|
|
|
* Objects with an XML representation.
|
|
|
|
* Objects without an XML representation.
|
|
|
|
* The former are represented by one or more XML files.
|
|
|
|
* The latter are in binary form.
|
|
|
|
*/
|
|
|
|
if (sType.startsWith("application/vnd.oasis.opendocument") || sType.startsWith("application/vnd.sun.xml")) {
|
|
|
|
// Allow either ODF or old OOo 1.x embedded objects
|
|
|
|
if (!sPath.equals("/")) { // Exclude the main document entries
|
|
|
|
if (sPath.endsWith("/")) { // Remove trailing slash
|
|
|
|
sPath=sPath.substring(0, sPath.length()-1);
|
2012-03-27 08:31:31 +00:00
|
|
|
}
|
2014-09-03 07:04:31 +00:00
|
|
|
embeddedObjects.put(sPath, new EmbeddedXMLObject(sPath, sType, this, zip));
|
2012-03-27 08:31:31 +00:00
|
|
|
}
|
2014-08-25 06:25:05 +00:00
|
|
|
}
|
|
|
|
else if (!sType.equals("text/xml")) {
|
|
|
|
// XML entries are either embedded ODF doc entries or main document entries, all other
|
|
|
|
// entries are included as binary objects
|
2014-09-03 07:04:31 +00:00
|
|
|
embeddedObjects.put(sPath, new EmbeddedBinaryObject(sPath, sType, this, zip));
|
2012-03-27 08:31:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns the embedded object corresponding to the name provided.
|
|
|
|
* The name should be stripped of any preceding path characters, such as
|
|
|
|
* '/', '.' or '#'.
|
|
|
|
*
|
|
|
|
* @param sName The name of the embedded object to retrieve.
|
|
|
|
*
|
|
|
|
* @return An <code>EmbeddedObject</code> instance representing the named
|
|
|
|
* object.
|
|
|
|
*/
|
|
|
|
public EmbeddedObject getEmbeddedObject(String sName) {
|
2014-08-25 06:25:05 +00:00
|
|
|
if (sName!=null && embeddedObjects!=null && embeddedObjects.containsKey(sName)) {
|
2012-03-27 08:31:31 +00:00
|
|
|
return embeddedObjects.get(sName);
|
|
|
|
}
|
|
|
|
return null;
|
|
|
|
}
|
2014-09-03 07:04:31 +00:00
|
|
|
|
|
|
|
protected void removeEmbeddedObject(String sName) {
|
|
|
|
if (sName!=null && embeddedObjects!=null && embeddedObjects.containsKey(sName)) {
|
|
|
|
embeddedObjects.remove(sName);
|
|
|
|
}
|
|
|
|
}
|
2012-03-27 08:31:31 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Read the document from a DOM tree (flat XML format)
|
|
|
|
*
|
|
|
|
* @param dom the DOM tree
|
|
|
|
*/
|
|
|
|
public void read(org.w3c.dom.Document dom) {
|
|
|
|
contentDoc = dom;
|
|
|
|
styleDoc = null;
|
|
|
|
metaDoc = null;
|
|
|
|
manifestDoc = null;
|
2014-08-25 06:25:05 +00:00
|
|
|
bIsPackageFormat = false;
|
2012-03-27 08:31:31 +00:00
|
|
|
embeddedObjects = null;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Read the Office <code>Document</code> from the given
|
|
|
|
* <code>InputStream</code>.
|
|
|
|
* Performs simple type detection to determine package or flat format
|
|
|
|
*
|
|
|
|
* @param is Office document <code>InputStream</code>.
|
|
|
|
*
|
|
|
|
* @throws IOException If any I/O error occurs.
|
2012-04-03 07:50:53 +00:00
|
|
|
* @throws SAXException
|
2012-03-27 08:31:31 +00:00
|
|
|
*/
|
|
|
|
public void read(InputStream is) throws IOException {
|
|
|
|
// We need to read 4 bytes ahead to detect flat or zip format
|
|
|
|
BufferedInputStream inbuf = new BufferedInputStream(is);
|
|
|
|
byte[] bytes = new byte[4];
|
|
|
|
inbuf.mark(4);
|
|
|
|
inbuf.read(bytes);
|
|
|
|
inbuf.reset();
|
|
|
|
boolean bZip = MIMETypes.ZIP.equals(MIMETypes.getMagicMIMEType(bytes));
|
|
|
|
if (bZip) {
|
|
|
|
readZip(inbuf);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
readFlat(inbuf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private void readZip(InputStream is) throws IOException {
|
2014-08-25 06:25:05 +00:00
|
|
|
SimpleZipReader zip = new SimpleZipReader();
|
2012-03-27 08:31:31 +00:00
|
|
|
zip.read(is);
|
|
|
|
|
|
|
|
byte contentBytes[] = zip.getEntry(CONTENTXML);
|
|
|
|
if (contentBytes == null) {
|
|
|
|
throw new IOException("Entry content.xml not found in file");
|
|
|
|
}
|
|
|
|
try {
|
|
|
|
contentDoc = parse(contentBytes);
|
|
|
|
} catch (SAXException ex) {
|
|
|
|
throw new IOException(ex);
|
|
|
|
}
|
|
|
|
|
|
|
|
byte styleBytes[] = zip.getEntry(STYLESXML);
|
|
|
|
if (styleBytes != null) {
|
|
|
|
try {
|
|
|
|
styleDoc = parse(styleBytes);
|
|
|
|
} catch (SAXException ex) {
|
|
|
|
throw new IOException(ex);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
byte metaBytes[] = zip.getEntry(METAXML);
|
|
|
|
if (metaBytes != null) {
|
|
|
|
try {
|
|
|
|
metaDoc = parse(metaBytes);
|
|
|
|
} catch (SAXException ex) {
|
|
|
|
throw new IOException(ex);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
byte manifestBytes[] = zip.getEntry(MANIFESTXML);
|
|
|
|
if (manifestBytes != null) {
|
|
|
|
try {
|
|
|
|
manifestDoc = parse(manifestBytes);
|
|
|
|
} catch (SAXException ex) {
|
|
|
|
throw new IOException(ex);
|
|
|
|
}
|
|
|
|
}
|
2014-08-25 06:25:05 +00:00
|
|
|
|
|
|
|
bIsPackageFormat = true;
|
|
|
|
getEmbeddedObjects(zip);
|
2012-03-27 08:31:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private void readFlat(InputStream is) throws IOException {
|
|
|
|
try {
|
2012-04-03 07:50:53 +00:00
|
|
|
contentDoc = SimpleXMLParser.parse(is);
|
|
|
|
} catch (SAXException e) {
|
|
|
|
throw new IOException(e);
|
2012-03-27 08:31:31 +00:00
|
|
|
}
|
|
|
|
styleDoc = null;
|
|
|
|
metaDoc = null;
|
|
|
|
manifestDoc = null;
|
2014-08-25 06:25:05 +00:00
|
|
|
bIsPackageFormat = false;
|
2012-03-27 08:31:31 +00:00
|
|
|
embeddedObjects = null;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Parse given <code>byte</code> array into a DOM
|
|
|
|
* <code>Document</code> object using the
|
|
|
|
* <code>DocumentBuilder</code> object.
|
|
|
|
*
|
|
|
|
* @param builder <code>DocumentBuilder</code> object for parsing.
|
|
|
|
* @param bytes <code>byte</code> array for parsing.
|
|
|
|
*
|
|
|
|
* @return Resulting DOM <code>Document</code> object.
|
|
|
|
*
|
|
|
|
* @throws SAXException If any parsing error occurs.
|
|
|
|
*/
|
|
|
|
static Document parse(byte bytes[]) throws SAXException, IOException {
|
|
|
|
SAXParserFactory factory=SAXParserFactory.newInstance();
|
2012-04-03 07:50:53 +00:00
|
|
|
SimpleXMLParser handler = new SimpleXMLParser();
|
2012-03-27 08:31:31 +00:00
|
|
|
try {
|
|
|
|
SAXParser saxParser = factory.newSAXParser();
|
|
|
|
saxParser.parse(new ByteArrayInputStream(bytes),handler);
|
|
|
|
return handler.getDOM();
|
|
|
|
}
|
|
|
|
catch (ParserConfigurationException e) {
|
|
|
|
System.err.println("Oops - failed to get XML parser!?");
|
|
|
|
e.printStackTrace();
|
|
|
|
}
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|