DocumentBuilder
objects. */
// NOTE: this factory is static; the constructor calls setValidating and
// setNamespaceAware on it, so those flags are shared by every instance.
private static DocumentBuilderFactory factory =
DocumentBuilderFactory.newInstance();
/** DOM Document of content.xml. */
private Document contentDoc = null;
/** DOM Document of meta.xml. */
private Document metaDoc = null;
/** DOM Document of settings.xml. */
private Document settingsDoc = null;
/** DOM Document of style.xml. */
private Document styleDoc = null;
/** DOM Document of META-INF/manifest.xml. */
private Document manifestDoc = null;
/** Document name with the file extension trimmed off (set by the constructor). */
private String documentName = null;
/** documentName followed by the value of getFileExtension() (set by the constructor). */
private String fileName = null;
/**
 * OfficeZip object to store zip contents from the read InputStream.
 * Note that this member will still be null if it was initialized
 * using a template file instead of reading from a StarOffice zipped
 * XML file.
 */
private OfficeZip zip = null;
/** Collection to keep track of the embedded objects in the document. */
private MapDocument
name.
*/
public OfficeDocument(String name)
{
this(name, true, false);
}
/**
* Constructor with arguments to set namespaceAware
* and validating
flags.
*
* @param name Document
name (may or may not
* contain extension).
* @param namespaceAware Value for namespaceAware
flag.
* @param validating Value for validating
flag.
*/
public OfficeDocument(String name, boolean namespaceAware, boolean validating) {
//res = Resources.getInstance();
factory.setValidating(validating);
factory.setNamespaceAware(namespaceAware);
this.documentName = trimDocumentName(name);
this.fileName = documentName + getFileExtension();
}
/**
 * Removes the file extension from the Document name.
 *
 * The extension returned by getFileExtension() is matched without regard
 * to case and independently of the default locale; the rest of the name
 * keeps its original casing.
 *
 * @param name Full Document name, possibly with extension.
 *
 * @return Name of Document without the extension, or the name unchanged
 *         when it does not end with the extension.
 */
private String trimDocumentName(String name) {
    String ext = getFileExtension();
    int stemLength = name.length() - ext.length();
    // regionMatches with ignoreCase compares character by character and
    // does not consult the default locale (unlike toLowerCase()), and it
    // never changes the string length. A negative offset, i.e. a name
    // shorter than the extension, simply yields false.
    if (name.regionMatches(true, stemLength, ext, 0, ext.length())) {
        // strip the extension
        return name.substring(0, stemLength);
    }
    return name;
}
// FIX2 (HJ): Determine whether this is package or flat format
/**
 * Package or flat format? The answer is derived from the zip member,
 * which readZip assigns when a zipped document is read.
 *
 * @return true if the document is in package format, false if it's flat xml
 */
public boolean isPackageFormat() { return zip!=null; }
/**
 * Return a DOM Document object of the content.xml file.
 * Note that a content DOM is not created when the constructor is
 * called (the field is initialised to null). So, either the read
 * method or the initContentDOM method will need to be called ahead
 * on this object before calling this method.
 *
 * @return DOM Document object.
 */
public Document getContentDOM() {
return contentDoc;
}
/**
 * Return a DOM Document object of the meta.xml file.
 * Note that a meta DOM is not created when the constructor is called
 * (the field is initialised to null); the read methods of this class
 * assign it. When a zipped document has no meta.xml bytes, readZip
 * leaves the field untouched, so this may return null.
 *
 * @return DOM Document object, possibly null.
 */
public Document getMetaDOM() {
return metaDoc;
}
/**
 * Return a DOM Document object of the settings.xml file.
 * Note that a settings DOM is not created when the constructor is
 * called (the field is initialised to null); the read methods of this
 * class assign it. When a zipped document has no settings.xml bytes,
 * readZip leaves the field untouched, so this may return null.
 *
 * @return DOM Document object, possibly null.
 */
public Document getSettingsDOM() {
return settingsDoc;
}
/**
 * Return a DOM Document object of the style.xml file.
 * Note that this may return null if there is no style DOM.
 * Note that a style DOM is not created when the constructor
 * is called. Depending on the InputStream, a read method may or
 * may not build a style DOM. When creating a new style DOM, call
 * the initStyleDOM method first.
 *
 * @return DOM Document object, possibly null.
 */
public Document getStyleDOM() {
return styleDoc;
}
/**
 * Return the name of the Document. This is the name passed to the
 * constructor after trimDocumentName has been applied to it.
 *
 * @return The name of Document.
 */
public String getName() {
return documentName;
}
/**
 * Return the file name of the Document, possibly with the standard
 * extension. The constructor builds it as the document name followed
 * by the value of getFileExtension().
 *
 * @return The file name of Document.
 */
public String getFileName() {
return fileName;
}
/**
 * Returns the file extension for this type of Document.
 * This implementation returns the empty string; the method is
 * protected, presumably so that subclasses can supply a real extension.
 *
 * @return The file extension of Document.
 */
// TODO: is this used?
protected String getFileExtension() { return ""; }
/**
 * Returns all the embedded objects (graphics, formulae, etc.) present in
 * this document.
 *
 * @return An Iterator of EmbeddedObject objects.
 */
public IteratorEmbeddedObject
instance representing the named
* object.
*/
/**
 * Looks up an embedded object by name.
 *
 * @param name Name of the embedded object.
 *
 * @return The EmbeddedObject stored under the given name, or null if the
 *         name is null, no manifest is available, or nothing is stored
 *         under that name.
 */
public EmbeddedObject getEmbeddedObject(String name) {
    if (name == null) {
        return null;
    }

    if (embeddedObjects == null) {
        // FIX2 (HJ): Return null if there's no manifest
        if (manifestDoc == null) {
            return null;
        }
        // Called for its side effect only; presumably this fills
        // embeddedObjects from the manifest - confirm against that method.
        getEmbeddedObjects();
    }

    // Map.get already answers null for a name that is not present.
    return embeddedObjects.get(name);
}
/**
 * Adds a new embedded object to the document.
 *
 * @param embObj An instance of EmbeddedObject.
 */
/*public void addEmbeddedObject(EmbeddedObject embObj) {
if (embObj == null) {
return;
}
if (embeddedObjects == null) {
embeddedObjects = new HashMapDocument
from the given
* InputStream
.
* FIX3 (HJ): Perform simple type detection to determine package or flat format
*
* @param is Office document InputStream
.
*
* @throws IOException If any I/O error occurs.
*/
/**
 * Read the Office Document from the given InputStream.
 * FIX3 (HJ): Perform simple type detection to determine package or flat format
 *
 * The stream is first copied into a byte array so that its type can be
 * detected and the same bytes can then be handed to
 * read(InputStream, boolean).
 *
 * @param is Office document InputStream.
 *
 * @throws IOException If any I/O error occurs.
 */
public void read(InputStream is) throws IOException {
    byte[] buffered = Misc.inputStreamToByteArray(is);
    // if it's zip, assume package - otherwise assume flat
    boolean packaged = MIMETypes.ZIP.equals(MIMETypes.getMagicMIMEType(buffered));
    read(new ByteArrayInputStream(buffered), packaged);
}
/**
 * Reads an Office document in zip (package) format and parses its XML
 * entries into the corresponding DOM members.
 *
 * content.xml is mandatory. The style, meta, settings and manifest
 * entries are parsed only when the zip supplies bytes for them;
 * otherwise the corresponding member is left as it was.
 *
 * @param is InputStream of the zipped Office document.
 *
 * @throws IOException If any I/O error occurs. A missing content.xml,
 *         a parser configuration problem or a parse error is reported
 *         as an OfficeDocumentException.
 */
private void readZip(InputStream is) throws IOException {
    DocumentBuilder builder;
    try {
        builder = factory.newDocumentBuilder();
    } catch (ParserConfigurationException ex) {
        throw new OfficeDocumentException(ex);
    }

    // read in Office zip file format
    zip = new OfficeZip();
    zip.read(is);

    // grab the content.xml and parse it into contentDoc.
    byte[] contentBytes = zip.getContentXMLBytes();
    if (contentBytes == null) {
        throw new OfficeDocumentException("Entry content.xml not found in file");
    }
    contentDoc = parseZipEntry(builder, contentBytes);

    // if style.xml exists, grab the style.xml and parse it into styleDoc.
    byte[] styleBytes = zip.getStyleXMLBytes();
    if (styleBytes != null) {
        styleDoc = parseZipEntry(builder, styleBytes);
    }

    byte[] metaBytes = zip.getMetaXMLBytes();
    if (metaBytes != null) {
        metaDoc = parseZipEntry(builder, metaBytes);
    }

    byte[] settingsBytes = zip.getSettingsXMLBytes();
    if (settingsBytes != null) {
        settingsDoc = parseZipEntry(builder, settingsBytes);
    }

    // Read in the META-INF/manifest.xml file
    byte[] manifestBytes = zip.getManifestXMLBytes();
    if (manifestBytes != null) {
        manifestDoc = parseZipEntry(builder, manifestBytes);
    }
}

/** Parses the bytes of one zip entry, rethrowing a SAXException as an OfficeDocumentException. */
private static Document parseZipEntry(DocumentBuilder builder, byte[] bytes)
        throws IOException {
    try {
        return parse(builder, bytes);
    } catch (SAXException ex) {
        throw new OfficeDocumentException(ex);
    }
}
/**
 * Read the Office Document from the given InputStream.
 *
 * A zipped document is delegated to readZip. Otherwise the stream is
 * parsed as one flat XML document, and separate content, styles,
 * settings and meta DOMs are built by importing the first occurrence
 * of each relevant element into a fresh DOM created by createDOM.
 *
 * @param is Office document InputStream.
 * @param isZip boolean Identifies whether a file is zipped or not.
 *
 * @throws IOException If any I/O error occurs. A parser configuration
 *         problem or a parse error is reported as an
 *         OfficeDocumentException.
 */
public void read(InputStream is, boolean isZip) throws IOException {
    if (isZip) {
        // readZip creates its own DocumentBuilder.
        readZip(is);
        return;
    }

    DocumentBuilder builder;
    try {
        builder = factory.newDocumentBuilder();
    } catch (ParserConfigurationException ex) {
        throw new OfficeDocumentException(ex);
    }

    org.w3c.dom.Document flatDoc;
    try {
        flatDoc = builder.parse(new InputSource(secondHack(is)));
    } catch (SAXException ex) {
        throw new OfficeDocumentException(ex);
    }

    /*content*/
    contentDoc = createDOM(TAG_OFFICE_DOCUMENT_CONTENT);
    importFirstOfEach(flatDoc, contentDoc,
            // FIX (HJ): Include office:font-decls in content DOM
            TAG_OFFICE_FONT_DECLS,
            // FIX2 (HJ): Include office:font-face-decls (OpenDocument) in content DOM
            TAG_OFFICE_FONT_FACE_DECLS,
            TAG_OFFICE_AUTOMATIC_STYLES,
            TAG_OFFICE_BODY);

    /*Styles*/
    styleDoc = createDOM(TAG_OFFICE_DOCUMENT_STYLES);
    importFirstOfEach(flatDoc, styleDoc,
            // FIX (HJ): Include office:font-decls in styles DOM
            TAG_OFFICE_FONT_DECLS,
            // FIX2 (HJ): Include office:font-face-decls in styles DOM
            TAG_OFFICE_FONT_FACE_DECLS,
            TAG_OFFICE_STYLES,
            // FIX (HJ): Include office:automatic-styles in styles DOM
            TAG_OFFICE_AUTOMATIC_STYLES,
            // FIX (HJ): Include office:master-styles in styles DOM
            TAG_OFFICE_MASTER_STYLES);

    /*Settings*/
    settingsDoc = createDOM(TAG_OFFICE_DOCUMENT_SETTINGS);
    importFirstOfEach(flatDoc, settingsDoc, TAG_OFFICE_SETTINGS);

    /*Meta*/
    metaDoc = createDOM(TAG_OFFICE_DOCUMENT_META);
    importFirstOfEach(flatDoc, metaDoc, TAG_OFFICE_META);
}

/**
 * For each tag name, in order, deep-copies the first matching element of
 * source (if any) and appends it to the root element of target.
 */
private static void importFirstOfEach(org.w3c.dom.Document source, Document target,
        String... tagNames) {
    Node targetRoot = target.getDocumentElement();
    for (String tagName : tagNames) {
        NodeList matches = source.getElementsByTagName(tagName);
        if (matches.getLength() > 0) {
            targetRoot.appendChild(target.importNode(matches.item(0), true));
        }
    }
}
/**
 * Parse given byte array into a DOM Document object using the
 * DocumentBuilder object. The bytes are first passed through
 * hack(InputStream), and the resulting Reader is what gets parsed.
 *
 * @param builder DocumentBuilder object for parsing.
 * @param bytes byte array for parsing.
 *
 * @return Resulting DOM Document object.
 *
 * @throws SAXException If any parsing error occurs.
 * @throws IOException If any I/O error occurs.
 */
static Document parse(DocumentBuilder builder, byte[] bytes)
        throws SAXException, IOException {
    // TODO: replace hack with a more appropriate fix.
    Reader filtered = hack(new ByteArrayInputStream(bytes));
    return builder.parse(new InputSource(filtered));
}
/**
 * Creates a new DOM Document containing minimum OpenOffice XML tags:
 * a root element of the given name carrying the namespace declarations,
 * an office:class attribute and an office:version attribute.
 *
 * This method uses the subclass getOfficeClassAttribute method to get
 * the attribute for office:class.
 *
 * @param rootName Name of the root element to create.
 *
 * @return The newly created DOM Document.
 *
 * @throws IOException If any I/O error occurs. A parser configuration
 *         problem is reported as an OfficeDocumentException.
 */
private final Document createDOM(String rootName) throws IOException {
    Document dom;
    try {
        dom = factory.newDocumentBuilder().newDocument();
    } catch (ParserConfigurationException ex) {
        throw new OfficeDocumentException(ex);
    }

    Element root = dom.createElement(rootName);
    dom.appendChild(root);

    // Namespace declarations, applied in this order.
    String[][] namespaces = {
        {"xmlns:office", "http://openoffice.org/2000/office"},
        {"xmlns:style", "http://openoffice.org/2000/style"},
        {"xmlns:text", "http://openoffice.org/2000/text"},
        {"xmlns:table", "http://openoffice.org/2000/table"},
        {"xmlns:draw", "http://openoffice.org/2000/drawing"},
        {"xmlns:fo", "http://www.w3.org/1999/XSL/Format"},
        {"xmlns:xlink", "http://www.w3.org/1999/xlink"},
        {"xmlns:number", "http://openoffice.org/2000/datastyle"},
        {"xmlns:svg", "http://www.w3.org/2000/svg"},
        {"xmlns:chart", "http://openoffice.org/2000/chart"},
        {"xmlns:dr3d", "http://openoffice.org/2000/dr3d"},
        {"xmlns:math", "http://www.w3.org/1998/Math/MathML"},
        {"xmlns:form", "http://openoffice.org/2000/form"},
        {"xmlns:script", "http://openoffice.org/2000/script"},
    };
    for (String[] namespace : namespaces) {
        root.setAttribute(namespace[0], namespace[1]);
    }

    root.setAttribute("office:class", getOfficeClassAttribute());
    root.setAttribute("office:version", "1.0");
    return dom;
}
/**
 * Return the office:class attribute value. createDOM writes this value
 * to the office:class attribute of every root element it creates.
 * This implementation returns the empty string.
 *
 * @return The attribute value.
 */
// not really used...
protected String getOfficeClassAttribute() { return ""; }
/**
* Hacked code to filter tag before * sending stream to parser.
* *This hacked code needs to be changed later on.
* *Issue: using current jaxp1.0 parser, there is no way * to turn off processing of dtds. Current set of dtds * have bugs, processing them will throw exceptions.
* *This is a simple hack that assumes the whole * tag are all in the same line. This is sufficient for * current StarOffice 6.0 generated XML files. Since this * hack really needs to go away, I don't want to spend * too much time in making it a perfect hack.
* FIX (HJ): Removed requirement for DOCTYPE to be in one line * FIX (HJ): No longer removes newlines * * @param isInputStream
to be filtered.
*
* @return Reader value without the tag.
*
* @throws IOException If any I/O error occurs.
*/
private static Reader hack(InputStream is) throws IOException {
BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
StringBuffer buffer = new StringBuffer();
String str = null;
while ((str = br.readLine()) != null) {
int sIndex = str.indexOf(" -1) {
buffer.append(str.substring(0, sIndex));
int eIndex = str.indexOf('>', sIndex + 8 );
if (eIndex > -1) {
buffer.append(str.substring(eIndex + 1, str.length()));
// FIX (HJ): Preserve the newline
buffer.append("\n");
} else {
// FIX (HJ): More than one line. Search for '>' in following lines
boolean bOK = false;
while ((str = br.readLine())!=null) {
eIndex = str.indexOf('>');
if (eIndex>-1) {
buffer.append(str.substring(eIndex+1));
// FIX (HJ): Preserve the newline
buffer.append("\n");
bOK = true;
break;
}
}
if (!bOK) { throw new IOException("Invalid XML"); }
}
} else {
buffer.append(str);
// FIX (HJ): Preserve the newline
buffer.append("\n");
}
}
StringReader r = new StringReader(buffer.toString());
return r;
}
/**
 * Transform the InputStream to a Reader Stream.
 *
 * This hacked code needs to be changed later on.
 *
 * Issue: the new oasis input file stream means that the old input
 * stream fails. see #i33702#
 *
 * The whole stream is decoded as UTF-8 into memory and handed back as
 * a StringReader.
 *
 * @param is InputStream to be filtered.
 *
 * @return Reader value of the InputStream().
 *
 * @throws IOException If any I/O error occurs.
 */
private static Reader secondHack(InputStream is) throws IOException {
    Reader in = new BufferedReader(new InputStreamReader(is, "UTF-8"));
    StringBuilder text = new StringBuilder();
    char[] chunk = new char[4096];
    for (int count = in.read(chunk, 0, chunk.length);
            count > 0;
            count = in.read(chunk, 0, chunk.length)) {
        text.append(chunk, 0, count);
    }

    // ensure there is no trailing garbage after the end of the stream.
    // NOTE(review): with empty string literals this trim-and-reappend step
    // changes nothing; presumably both literals once held the closing tag
    // of the root element - confirm against the upstream source.
    int cut = text.lastIndexOf("");
    text.delete(cut, text.length());
    text.append("");

    return new StringReader(text.toString());
}
}