/************************************************************************
*
* OfficeReader.java
*
* Copyright: 2002-2015 by Henrik Just
*
* This file is part of Writer2LaTeX.
*
* Writer2LaTeX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Writer2LaTeX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Writer2LaTeX. If not, see <http://www.gnu.org/licenses/>.
*
* This class reads and collects global information about an OOo document.
* This includes styles, forms, information about indexes and references etc.
*
*/ public class OfficeReader { /////////////////////////////////////////////////////////////////////////// // Static methods /** Checks, if a node is an element in the text namespace * @param node the node to check * @return true if this is a text element */ public static boolean isTextElement(Node node) { return node.getNodeType()==Node.ELEMENT_NODE && node.getNodeName().startsWith(XMLString.TEXT_); } /** Checks, if a node is an element in the table namespace * @param node the node to check * @return true if this is a table element */ public static boolean isTableElement(Node node) { return node.getNodeType()==Node.ELEMENT_NODE && node.getNodeName().startsWith(XMLString.TABLE_); } /** Checks, if a node is an element in the draw namespace * @param node the node to check * @return true if this is a draw element */ public static boolean isDrawElement(Node node) { return node.getNodeType()==Node.ELEMENT_NODE && node.getNodeName().startsWith(XMLString.DRAW_); } /** Checks, if a node is an element representing a note (footnote/endnote) * @param node the node to check * @return true if this is a note element */ public static boolean isNoteElement(Node node) { return node.getNodeType()==Node.ELEMENT_NODE && ( node.getNodeName().equals(XMLString.TEXT_NOTE) || node.getNodeName().equals(XMLString.TEXT_FOOTNOTE) || node.getNodeName().equals(XMLString.TEXT_ENDNOTE) ); } /** Get the paragraph or heading containing a node * * @param node the node in question * @return the paragraph or heading */ public static Element getParagraph(Element node) { Element parent = (Element) node.getParentNode(); if (parent.getTagName().equals(XMLString.TEXT_P) || parent.getTagName().equals(XMLString.TEXT_H)) { return parent; } return getParagraph(parent); } /** Checks, if this node contains at most one element, and that this is a * paragraph. 
* @param node the node to check * @return true if the node contains a single paragraph or nothing */ public static boolean isSingleParagraph(Node node) { boolean bFoundPar = false; Node child = node.getFirstChild(); while (child!=null) { if (child.getNodeType()==Node.ELEMENT_NODE) { if (child.getNodeName().equals(XMLString.TEXT_P)) { if (bFoundPar) { return false; } else { bFoundPar = true; } } else { return false; } } child = child.getNextSibling(); } return bFoundPar; } /** Checks, if the only text content of this node is whitespace. * Other (draw) content is allowed. * @param node the node to check (should be a paragraph node or a child * of a paragraph node) * @return true if the node contains whitespace only */ public static boolean isNoTextPar(Node node) { Node child = node.getFirstChild(); while (child!=null) { if (child.getNodeType()==Node.ELEMENT_NODE) { if (isTextElement(child)) { if (!isWhitespaceContent(child)) { return false; } } } else if (child.getNodeType()==Node.TEXT_NODE) { if (!isWhitespace(child.getNodeValue())) { return false; } } child = child.getNextSibling(); } return true; // found nothing! } /**Checks, if the only text content of this node is whitespace
* @param node the node to check (should be a paragraph node or a child * of a paragraph node) * @return true if the node contains whitespace only */ public static boolean isWhitespaceContent(Node node) { Node child = node.getFirstChild(); while (child!=null) { if (child.getNodeType()==Node.ELEMENT_NODE) { if (isTextElement(child)) { if (!isWhitespaceContent(child)) { return false; } } else { return false; // found non-text content! } } else if (child.getNodeType()==Node.TEXT_NODE) { if (!isWhitespace(child.getNodeValue())) { return false; } } child = child.getNextSibling(); } return true; // found nothing! } /**Checks, if this text is whitespace
* @param s the String to check * @return true if the String contains whitespace only */ public static boolean isWhitespace(String s) { int nLen = s.length(); for (int i=0; iGet the collection of all font declarations.
* @return the <code>OfficeStyleFamily</code> of font declarations
*/
/** Get the collection of all font declarations.
 *  @return the <code>OfficeStyleFamily</code> of font declarations
 */
public OfficeStyleFamily getFontDeclarations() {
    return this.font;
}
/** Get a specific font declaration
 *  @param sName the name of the font declaration
 *  @return a <code>FontDeclaration</code> representing the font, as looked up
 *  in the family of font declarations
 */
public FontDeclaration getFontDeclaration(String sName) {
    final FontDeclaration declaration = (FontDeclaration) font.getStyle(sName);
    return declaration;
}
// Accessor methods for styles
/** Get the collection of all text styles (styles of family "text").
 *  @return the <code>OfficeStyleFamily</code> of text styles
 */
public OfficeStyleFamily getTextStyles() {
    return this.text;
}
/** Get a specific text style
 *  @param sName the name of the text style
 *  @return the result of looking up the name in the family of text styles
 */
public StyleWithProperties getTextStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) text.getStyle(sName);
    return style;
}
/** Get the collection of all paragraph styles (styles of family "paragraph").
 *  @return the <code>OfficeStyleFamily</code> of paragraph styles
 */
public OfficeStyleFamily getParStyles() {
    return this.par;
}
/** Get a specific paragraph style
 *  @param sName the name of the paragraph style
 *  @return the result of looking up the name in the family of paragraph styles
 */
public StyleWithProperties getParStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) par.getStyle(sName);
    return style;
}
/** Get the default paragraph style
 *  @return the default style of the family of paragraph styles
 */
public StyleWithProperties getDefaultParStyle() {
    final StyleWithProperties defaultStyle = (StyleWithProperties) par.getDefaultStyle();
    return defaultStyle;
}
/** Get the collection of all section styles (styles of family "section").
 *  @return the <code>OfficeStyleFamily</code> of section styles
 */
public OfficeStyleFamily getSectionStyles() {
    return this.section;
}
/** Get a specific section style
 *  @param sName the name of the section style
 *  @return the result of looking up the name in the family of section styles
 */
public StyleWithProperties getSectionStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) section.getStyle(sName);
    return style;
}
/** Get the collection of all table styles (styles of family "table").
 *  @return the <code>OfficeStyleFamily</code> of table styles
 */
public OfficeStyleFamily getTableStyles() {
    return this.table;
}
/** Get a specific table style
 *  @param sName the name of the table style
 *  @return the result of looking up the name in the family of table styles
 */
public StyleWithProperties getTableStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) table.getStyle(sName);
    return style;
}
/** Get the collection of all table column styles (styles of family "table-column").
 *  @return the <code>OfficeStyleFamily</code> of column styles
 */
public OfficeStyleFamily getColumnStyles() {
    return this.column;
}
/** Get a specific table column style
 *  @param sName the name of the column style
 *  @return the result of looking up the name in the family of column styles
 */
public StyleWithProperties getColumnStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) column.getStyle(sName);
    return style;
}
/** Get the collection of all table row styles (styles of family "table-row").
 *  @return the <code>OfficeStyleFamily</code> of row styles
 */
public OfficeStyleFamily getRowStyles() {
    return this.row;
}
/** Get a specific table row style
 *  @param sName the name of the row style
 *  @return the result of looking up the name in the family of row styles
 */
public StyleWithProperties getRowStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) row.getStyle(sName);
    return style;
}
/** Get the collection of all table cell styles (styles of family "table-cell").
 *  @return the <code>OfficeStyleFamily</code> of cell styles
 */
public OfficeStyleFamily getCellStyles() {
    return this.cell;
}
/** Get a specific table cell style
 *  @param sName the name of the cell style
 *  @return the result of looking up the name in the family of cell styles
 */
public StyleWithProperties getCellStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) cell.getStyle(sName);
    return style;
}
/** Get the default table cell style
 *  @return the default style of the family of cell styles
 */
public StyleWithProperties getDefaultCellStyle() {
    final StyleWithProperties defaultStyle = (StyleWithProperties) cell.getDefaultStyle();
    return defaultStyle;
}
/** Get the collection of all frame styles (styles of family "graphics",
 *  or "graphic" in the OASIS format).
 *  @return the <code>OfficeStyleFamily</code> of frame styles
 */
public OfficeStyleFamily getFrameStyles() {
    return this.frame;
}
/** Get a specific frame style
 *  @param sName the name of the frame style
 *  @return the result of looking up the name in the family of frame styles
 */
public StyleWithProperties getFrameStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) frame.getStyle(sName);
    return style;
}
/** Get the default frame style
 *  @return the default style of the family of frame styles
 */
public StyleWithProperties getDefaultFrameStyle() {
    final StyleWithProperties defaultStyle = (StyleWithProperties) frame.getDefaultStyle();
    return defaultStyle;
}
/** Get the collection of all presentation styles (styles of family "presentation").
 *  @return the <code>OfficeStyleFamily</code> of presentation styles
 */
public OfficeStyleFamily getPresentationStyles() {
    return this.presentation;
}
/** Get a specific presentation style
 *  @param sName the name of the presentation style
 *  @return the result of looking up the name in the family of presentation styles
 */
public StyleWithProperties getPresentationStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) presentation.getStyle(sName);
    return style;
}
/** Get the default presentation style
 *  @return the default style of the family of presentation styles
 */
public StyleWithProperties getDefaultPresentationStyle() {
    final StyleWithProperties defaultStyle = (StyleWithProperties) presentation.getDefaultStyle();
    return defaultStyle;
}
/** Get the collection of all drawing page styles (styles of family "drawing-page").
 *  @return the <code>OfficeStyleFamily</code> of drawing page styles
 */
public OfficeStyleFamily getDrawingPageStyles() {
    return this.drawingPage;
}
/** Get a specific drawing page style
 *  @param sName the name of the drawing page style
 *  @return the result of looking up the name in the family of drawing page styles
 */
public StyleWithProperties getDrawingPageStyle(String sName) {
    final StyleWithProperties style = (StyleWithProperties) drawingPage.getStyle(sName);
    return style;
}
/** Get the default drawing page style
 *  @return the default style of the family of drawing page styles
 */
public StyleWithProperties getDefaultDrawingPageStyle() {
    final StyleWithProperties defaultStyle = (StyleWithProperties) drawingPage.getDefaultStyle();
    return defaultStyle;
}
/** Get the collection of all list styles.
 *  @return the <code>OfficeStyleFamily</code> of list styles
 */
public OfficeStyleFamily getListStyles() {
    return this.list;
}
/** Get a specific list style
 *  @param sName the name of the list style
 *  @return the result of looking up the name in the family of list styles
 */
public ListStyle getListStyle(String sName) {
    final ListStyle style = (ListStyle) list.getStyle(sName);
    return style;
}
/** Get the collection of all page layouts (page masters in the old file format).
 *  @return the <code>OfficeStyleFamily</code> of page layouts
 */
public OfficeStyleFamily getPageLayouts() {
    return this.pageLayout;
}
/** Get a specific page layout
 *  @param sName the name of the page layout
 *  @return the result of looking up the name in the family of page layouts
 */
public PageLayout getPageLayout(String sName) {
    final PageLayout layout = (PageLayout) pageLayout.getStyle(sName);
    return layout;
}
/** Get the collection of all master pages.
 *  @return the <code>OfficeStyleFamily</code> of master pages
 */
public OfficeStyleFamily getMasterPages() {
    return this.masterPage;
}
/** Get a specific master page
 *  @param sName the name of the master page
 *  @return the result of looking up the name in the family of master pages
 */
public MasterPage getMasterPage(String sName) {
    final MasterPage page = (MasterPage) masterPage.getStyle(sName);
    return page;
}
public MapReturns the paragraph style associated with headings of a specific
* level. Returns <code>null</code> if no such style is known.
* <p>In principle, different styles can be used for each heading, in
* practice the same (soft) style is used for all headings of a specific
* level.</p>
* @param nLevel the level of the heading
* @return a <code>StyleWithProperties</code> object representing the style
*/
public StyleWithProperties getHeadingStyle(int nLevel) {
    // Only the levels 1 to 10 are looked up; any other level has no style
    if (nLevel<1 || nLevel>10) {
        return null;
    }
    return heading[nLevel];
}
/**
Returns the first master page used in the document. If no master
* page is used explicitly, the first master page found in the styles is
* returned. Returns null if no master pages exists.
* @return a Returns a reader for a specific toc
* @param onode the Is this style used in some toc as an index source style? Does this sequence name belong to a lof? Does this sequence name belong to a lot? Add a sequence name for table captions. OpenDocument has a very weak notion of table captions: A caption is a
* paragraph containing a text:sequence element. Moreover, the only source
* to identify which sequence number to use is the list(s) of tables.
* If there's no list of tables, captions cannot be identified.
* Thus this method lets the user add a sequence name to identify the
* table captions.
* @param sName the name to add
*/
// Registers sName in the collection of sequence names identifying table captions
public void addTableSequenceName(String sName) {
    this.tableSequenceNames.add(sName);
}
/** <p>Add a sequence name for figure captions.</p>
 * <p>OpenDocument has a very weak notion of figure captions: A caption is a
 * paragraph containing a text:sequence element. Moreover, the only source
 * to identify which sequence number to use is the list(s) of figures.
 * If there's no list of figures, captions cannot be identified.
 * Thus this method lets the user add a sequence name to identify the
 * figure captions.</p>
 * @param sName the name to add
 */
public void addFigureSequenceName(String sName) {
    this.figureSequenceNames.add(sName);
}
/** Is there a reference to this note id?
 * The id is looked up among the footnote references first, then among the
 * endnote references.
 * @param sId the id of the note
 * @return true if there is a reference
 */
public boolean hasNoteRefTo(String sId) {
    if (footnoteRef.contains(sId)) {
        return true;
    }
    return endnoteRef.contains(sId);
}
/** Is there a reference to this footnote id?
 * @param sId the id of the footnote
 * @return true if the id is among the collected footnote references
 */
public boolean hasFootnoteRefTo(String sId) {
    final boolean bReferenced = footnoteRef.contains(sId);
    return bReferenced;
}
/** Is there a reference to this endnote id?
 * @param sId the id of the endnote
 * @return true if the id is among the collected endnote references
 */
public boolean hasEndnoteRefTo(String sId) {
    final boolean bReferenced = endnoteRef.contains(sId);
    return bReferenced;
}
/** Is this reference mark contained in a heading?
 * @param sName the name of the reference mark
 * @return true if the name is registered as a reference mark in a heading
 */
public boolean referenceMarkInHeading(String sName) {
    final boolean bInHeading = referenceHeading.containsKey(sName);
    return bInHeading;
}
/** Is there a reference to this reference mark?
 * @param sName the name of the reference mark
 * @return true if the name is among the collected reference mark references
 */
public boolean hasReferenceRefTo(String sName) {
    final boolean bReferenced = referenceRef.contains(sName);
    return bReferenced;
}
/** Is this bookmark contained in a heading?
 * @param sName the name of the bookmark
 * @return true if the name is registered as a bookmark in a heading
 */
public boolean bookmarkInHeading(String sName) {
    final boolean bInHeading = bookmarkHeading.containsKey(sName);
    return bInHeading;
}
/** Get the level of the heading associated with this bookmark
 * @param sName the name of the bookmark
 * @return the level or 0 if the bookmark does not exist (or is not contained
 * in a heading)
 */
public int getBookmarkHeadingLevel(String sName) {
    // Returning the map value directly would unbox null for an unknown
    // bookmark and throw a NullPointerException instead of returning 0
    final Integer level = bookmarkHeading.get(sName);
    return level!=null ? level.intValue() : 0;
}
/** Is this bookmark contained in a list?
 * @param sName the name of the bookmark
 * @return true if the name is registered as a bookmark in a list
 */
public boolean bookmarkInList(String sName) {
    final boolean bInList = bookmarkList.containsKey(sName);
    return bInList;
}
/** Get the list style name associated with a bookmark in a list
 * @param sName the name of the bookmark
 * @return the list style name or null if the bookmark does not exist or the
 * list does not have a style name
 */
public String getBookmarkListStyle(String sName) {
    return bookmarkList.containsKey(sName) ? bookmarkList.get(sName) : null;
}
/** Get the list level associated with a bookmark in a list
 * @param sName the name of the bookmark
 * @return the level or 0 if the bookmark does not exist
 */
public int getBookmarkListLevel(String sName) {
    // Unknown bookmarks are reported as level 0
    if (!bookmarkListLevel.containsKey(sName)) {
        return 0;
    }
    return bookmarkListLevel.get(sName);
}
/** Is there a reference to this bookmark?
 * @param sName the name of the bookmark
 * @return true if the name is among the collected bookmark references
 */
public boolean hasBookmarkRefTo(String sName) {
    final boolean bReferenced = bookmarkRef.contains(sName);
    return bReferenced;
}
/** Get the raw list of all text:bibliography-mark elements. The marks are returned in document order and
* includes any duplicates
*
* @return the list
*/
public List Is there a reference to this sequence field?
* @param sId the id of the sequence field
* @return true if there is a reference
*/
// Reports whether sId is among the collected references to sequence fields
public boolean hasSequenceRefTo(String sId) {
    final boolean bReferenced = sequenceRef.contains(sId);
    return bReferenced;
}
/** Is there a link to this sequence anchor name?
 * @param sName the name of the anchor
 * @return true if the name is among the collected links
 */
public boolean hasLinkTo(String sName) {
    final boolean bLinked = links.contains(sName);
    return bLinked;
}
/** Is this an OASIS OpenDocument or an OOo 1.0 document?
 * @return true if it's an OASIS OpenDocument
 */
public boolean isOpenDocument() {
    return this.bOpenDocument;
}
/** Is this a text document?
 * @return true if it's a text document
 */
public boolean isText() {
    return this.bText;
}
/** Is this a spreadsheet document?
 * @return true if it's a spreadsheet document
 */
public boolean isSpreadsheet() {
    return this.bSpreadsheet;
}
/** Is this a presentation document?
 * @return true if it's a presentation document
 */
public boolean isPresentation() {
    return this.bPresentation;
}
/** Get the content element In the old file format this means the In the OpenDocument format this means a Get the forms belonging to this document. Read a table from a table:table nodeMasterPage
object representing the master page
*/
/** Returns the first master page of the document.
 *  @return a <code>MasterPage</code> object representing the master page,
 *  or null if none has been recorded
 */
public MasterPage getFirstMasterPage() {
    return this.firstMasterPage;
}
/** Return the iso language used in most paragaph styles (in a well-structured
* document this will be the default language)
* TODO: Base on content rather than style
* @return the iso language
*/
public String getMajorityLanguage() {
Hashtabletext:table-of-content-node
* @return the reader, or null
*/
/** Returns a reader for a specific toc
 *  @param onode the text:table-of-content node
 *  @return the reader, or null if the node is not registered as an index
 */
public TocReader getTocReader(Element onode) {
    return indexes.containsKey(onode) ? (TocReader) indexes.get(onode) : null;
}
/** <p>Get the content element</p>
 * <p>In the old file format this means the <code>office:body</code> element</p>
 * <p>In the OpenDocument format this means a <code>office:text</code>,
 * <code>office:spreadsheet</code> or <code>office:presentation</code>
 * element.</p>
 * @return the content <code>Element</code>
 */
public Element getContent() {
    return this.content;
}
/** Get the forms belonging to this document.
 * @return a <code>FormsReader</code> representing the forms
 */
public FormsReader getForms() {
    return this.forms;
}
/** Read a table from a table:table node.
 * A new reader is created on every call.
 * @param node the table:table element
 * @return a <code>TableReader</code> object representing the table
 */
public TableReader getTableReader(Element node) {
    final TableReader reader = new TableReader(this, node);
    return reader;
}
/** Get the very first image in this document, if any
 *
 * @return the element of the first image, or null if no images exists
 */
public Element getFirstImage() {
    return this.firstImage;
}
/** Constructor; read a document */
public OfficeReader(OfficeDocument oooDoc, boolean bAllParagraphsAreSoft) {
this.oooDoc = oooDoc;
loadStylesFromDOM(oooDoc.getStyleDOM(),oooDoc.getContentDOM(),bAllParagraphsAreSoft);
loadContentFromDOM(oooDoc.getContentDOM());
}
///////////////////////////////////////////////////////////////////////////
// Helpers
/*private void collectMasterPage(StyleWithProperties style) {
if (style==null || firstMasterPage!=null) { return; }
String s = style.getMasterPageName();
if (s!=null && s.length()>0) {
firstMasterPage = getMasterPage(s);
}
}*/
/** Loads the styles found directly below a style container element into
 *  the matching style families. Child elements which are not recognized
 *  are ignored.
 *  @param node the container; should be office:master-styles, office:styles
 *  or office:automatic-styles
 *  @param bAllParagraphsAreSoft if true, paragraph styles are never loaded
 *  as automatic styles, not even from office:automatic-styles
 */
private void loadStylesFromDOM(Node node, boolean bAllParagraphsAreSoft) {
    // Only styles below office:automatic-styles are flagged as automatic
    final boolean bAutomatic = XMLString.OFFICE_AUTOMATIC_STYLES.equals(node.getNodeName());
    final boolean bAutomaticPar = bAutomatic && !bAllParagraphsAreSoft;

    for (Node child = node.getFirstChild(); child!=null; child = child.getNextSibling()) {
        if (child.getNodeType()!=Node.ELEMENT_NODE) { continue; }
        final String sElementName = child.getNodeName();

        if (sElementName.equals(XMLString.STYLE_STYLE)) {
            // Ordinary styles are dispatched on their family
            final String sFamily = Misc.getAttribute(child,XMLString.STYLE_FAMILY);
            if ("text".equals(sFamily)) {
                text.loadStyleFromDOM(child,bAutomatic);
            }
            else if ("paragraph".equals(sFamily)) {
                par.loadStyleFromDOM(child,bAutomaticPar);
            }
            else if ("section".equals(sFamily)) {
                section.loadStyleFromDOM(child,bAutomatic);
            }
            else if ("table".equals(sFamily)) {
                table.loadStyleFromDOM(child,bAutomatic);
            }
            else if ("table-column".equals(sFamily)) {
                column.loadStyleFromDOM(child,bAutomatic);
            }
            else if ("table-row".equals(sFamily)) {
                row.loadStyleFromDOM(child,bAutomatic);
            }
            else if ("table-cell".equals(sFamily)) {
                cell.loadStyleFromDOM(child,bAutomatic);
            }
            else if ("graphics".equals(sFamily) || "graphic".equals(sFamily)) { // oasis: no s
                frame.loadStyleFromDOM(child,bAutomatic);
            }
            else if ("presentation".equals(sFamily)) {
                presentation.loadStyleFromDOM(child,bAutomatic);
            }
            else if ("drawing-page".equals(sFamily)) {
                // Bug in OOo 1.x: The same name may be used for a real and an automatic style...
                // hence the first style loaded under a name wins
                if (drawingPage.getStyle(Misc.getAttribute(child,XMLString.STYLE_NAME))==null) {
                    drawingPage.loadStyleFromDOM(child,bAutomatic);
                }
            }
        }
        else if (sElementName.equals(XMLString.STYLE_PAGE_MASTER)       // old
              || sElementName.equals(XMLString.STYLE_PAGE_LAYOUT)) {    // oasis
            pageLayout.loadStyleFromDOM(child,bAutomatic);
        }
        else if (sElementName.equals(XMLString.STYLE_MASTER_PAGE)) {
            masterPage.loadStyleFromDOM(child,bAutomatic);
            // Register the master page just loaded under its name, and
            // remember the very first one
            final String sMasterPageName = Misc.getAttribute(child,XMLString.STYLE_NAME);
            final MasterPage loadedPage = (MasterPage) masterPage.getStyle(sMasterPageName);
            masterPages.put(sMasterPageName, loadedPage);
            if (firstMasterPage==null) {
                firstMasterPage = loadedPage;
            }
        }
        else if (sElementName.equals(XMLString.TEXT_LIST_STYLE)) {
            list.loadStyleFromDOM(child,bAutomatic);
        }
        else if (sElementName.equals(XMLString.TEXT_OUTLINE_STYLE)) {
            outline.loadStyleFromDOM(child);
        }
        else if (sElementName.equals(XMLString.STYLE_DEFAULT_STYLE)) {
            // Default styles are only kept for paragraphs, frames and table cells
            final String sFamily = Misc.getAttribute(child,XMLString.STYLE_FAMILY);
            final boolean bPar = "paragraph".equals(sFamily);
            final boolean bFrame = !bPar
                && ("graphics".equals(sFamily) || "graphic".equals(sFamily)); // oasis: no s
            final boolean bCell = !bPar && !bFrame && "table-cell".equals(sFamily);
            if (bPar || bFrame || bCell) {
                final StyleWithProperties defaultStyle = new StyleWithProperties();
                defaultStyle.loadStyleFromDOM(child);
                if (bPar) {
                    par.setDefaultStyle(defaultStyle);
                }
                else if (bFrame) {
                    frame.setDefaultStyle(defaultStyle);
                }
                else {
                    cell.setDefaultStyle(defaultStyle);
                }
            }
        }
    }
}
private void loadStylesFromDOM(Document stylesDOM, Document contentDOM, boolean bAllParagraphsAreSoft){
// Flat xml: stylesDOM will be null and contentDOM contain everything
// This is only the case for old versions of xmerge; newer versions
// creates DOM for styles, content, meta and settings.
NodeList list;
// font declarations: Try old format first
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.OFFICE_FONT_DECLS);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_FONT_DECLS);
}
// If that fails, try oasis format
if (list.getLength()==0) {
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.OFFICE_FONT_FACE_DECLS);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_FONT_FACE_DECLS);
}
}
if (list.getLength()!=0) {
Node node = list.item(0);
if (node.hasChildNodes()){
NodeList nl = node.getChildNodes();
int nLen = nl.getLength();
for (int i = 0; i < nLen; i++ ) {
Node child = nl.item(i);
if (child.getNodeType()==Node.ELEMENT_NODE){
if (child.getNodeName().equals(XMLString.STYLE_FONT_DECL)){
font.loadStyleFromDOM(child,false);
}
else if (child.getNodeName().equals(XMLString.STYLE_FONT_FACE)){
font.loadStyleFromDOM(child,false);
}
}
}
}
}
// soft formatting:
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.OFFICE_STYLES);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_STYLES);
}
if (list.getLength()!=0) {
loadStylesFromDOM(list.item(0),bAllParagraphsAreSoft);
}
// master styles:
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.OFFICE_MASTER_STYLES);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_MASTER_STYLES);
}
if (list.getLength()!=0) {
loadStylesFromDOM(list.item(0),bAllParagraphsAreSoft);
}
// hard formatting:
// Load from styles.xml first. Problem: There may be name clashes
// with automatic styles from content.xml
if (stylesDOM!=null) {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_AUTOMATIC_STYLES);
if (list.getLength()!=0) {
loadStylesFromDOM(list.item(0),bAllParagraphsAreSoft);
}
}
list = contentDOM.getElementsByTagName(XMLString.OFFICE_AUTOMATIC_STYLES);
if (list.getLength()!=0) {
loadStylesFromDOM(list.item(0),bAllParagraphsAreSoft);
}
// footnotes configuration:
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.TEXT_FOOTNOTES_CONFIGURATION);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.TEXT_FOOTNOTES_CONFIGURATION);
}
if (list.getLength()!=0) {
footnotes = new PropertySet();
footnotes.loadFromDOM(list.item(0));
}
// endnotes configuration:
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.TEXT_ENDNOTES_CONFIGURATION);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.TEXT_ENDNOTES_CONFIGURATION);
}
if (list.getLength()!=0) {
endnotes = new PropertySet();
endnotes.loadFromDOM(list.item(0));
}
// if it failed, try oasis format
if (footnotes==null || endnotes==null) {
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.TEXT_NOTES_CONFIGURATION);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.TEXT_NOTES_CONFIGURATION);
}
int nLen = list.getLength();
for (int i=0; i