/************************************************************************ * * OfficeReader.java * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA * * Copyright: 2002-2008 by Henrik Just * * All Rights Reserved. * * Version 1.0 (2008-09-22) * */ package writer2latex.office; import java.util.Enumeration; import java.util.Hashtable; import java.util.HashSet; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.Document; import writer2latex.xmerge.OfficeDocument; import writer2latex.util.Misc; /**
This class reads and collects global information about an OOo document. * This includes styles, forms, information about indexes and references etc. *
*/ public class OfficeReader { /////////////////////////////////////////////////////////////////////////// // Static methods /** Checks, if a node is an element in the text namespace * @param node the node to check * @return true if this is a text element */ public static boolean isTextElement(Node node) { return node.getNodeType()==Node.ELEMENT_NODE && node.getNodeName().startsWith(XMLString.TEXT_); } /** Checks, if a node is an element in the table namespace * @param node the node to check * @return true if this is a table element */ public static boolean isTableElement(Node node) { return node.getNodeType()==Node.ELEMENT_NODE && node.getNodeName().startsWith(XMLString.TABLE_); } /** Checks, if a node is an element in the draw namespace * @param node the node to check * @return true if this is a draw element */ public static boolean isDrawElement(Node node) { return node.getNodeType()==Node.ELEMENT_NODE && node.getNodeName().startsWith(XMLString.DRAW_); } /** Checks, if a node is an element representing a note (footnote/endnote) * @param node the node to check * @return true if this is a note element */ public static boolean isNoteElement(Node node) { return node.getNodeType()==Node.ELEMENT_NODE && ( node.getNodeName().equals(XMLString.TEXT_NOTE) || node.getNodeName().equals(XMLString.TEXT_FOOTNOTE) || node.getNodeName().equals(XMLString.TEXT_ENDNOTE) ); } /** Checks, if this node contains at most one element, and that this is a * paragraph. 
* @param node the node to check
     * @return true if the node has exactly one element child and that child
     * is a paragraph (<code>XMLString.TEXT_P</code>); false if there is no
     * paragraph, more than one paragraph, or any other element child.
     * Non-element children (e.g. text nodes) are ignored.
     */
    public static boolean isSingleParagraph(Node node) {
        // NOTE(review): older documentation said "a single paragraph or nothing",
        // but a node without any paragraph yields false here.
        boolean bSeenPar = false;
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            // A second paragraph or any non-paragraph element disqualifies the node
            if (bSeenPar || !child.getNodeName().equals(XMLString.TEXT_P)) {
                return false;
            }
            bSeenPar = true;
        }
        return bSeenPar;
    }

    /**Checks, if the only text content of this node is whitespace
* @param node the node to check (should be a paragraph node or a child * of a paragraph node) * @return true if the node contains whitespace only */ public static boolean isWhitespaceContent(Node node) { Node child = node.getFirstChild(); while (child!=null) { if (child.getNodeType()==Node.ELEMENT_NODE) { if (child.getNodeName().equals(XMLString.TEXT_SPAN)) { if (!isWhitespaceContent(child)) { return false; } } else if (child.getNodeName().equals(XMLString.TEXT_A)) { if (!isWhitespaceContent(child)) { return false; } } else if (child.getNodeName().equals(XMLString.TEXT_BIBLIOGRAPHY_MARK)) { if (!isWhitespaceContent(child)) { return false; } } else if (!isTextElement(child)) { return false; // found non-text content! } } else if (child.getNodeType()==Node.TEXT_NODE) { if (!isWhitespace(child.getNodeValue())) { return false; } } child = child.getNextSibling(); } return true; // found nothing! } /**Checks, if this text is whitespace
* @param s the String to check * @return true if the String contains whitespace only */ public static boolean isWhitespace(String s) { int nLen = s.length(); for (int i=0; iGet the collection of all font declarations.
* @return the <code>OfficeStyleFamily</code> of font declarations
*/
public OfficeStyleFamily getFontDeclarations() {
    // The family itself is handed out, not a copy
    return font;
}
/** Get a specific font declaration
 * @param sName the name of the font declaration
 * @return a <code>FontDeclaration</code> representing the font, as looked
 * up by name in the font declarations family
 */
public FontDeclaration getFontDeclaration(String sName) {
    Object declaration = font.getStyle(sName);
    return (FontDeclaration) declaration;
}
// Accessor methods for styles
/** Get the collection of all text styles
 * @return the <code>OfficeStyleFamily</code> of text styles
 */
public OfficeStyleFamily getTextStyles() {
    return text;
}

/** Get a specific text style
 * @param sName the name of the text style
 * @return the result of looking up the name in the text style family
 */
public StyleWithProperties getTextStyle(String sName) {
    Object style = text.getStyle(sName);
    return (StyleWithProperties) style;
}
/** Get the collection of all paragraph styles
 * @return the <code>OfficeStyleFamily</code> of paragraph styles
 */
public OfficeStyleFamily getParStyles() {
    return par;
}

/** Get a specific paragraph style
 * @param sName the name of the paragraph style
 * @return the result of looking up the name in the paragraph style family
 */
public StyleWithProperties getParStyle(String sName) {
    Object style = par.getStyle(sName);
    return (StyleWithProperties) style;
}

/** Get the default paragraph style
 * @return the default style registered in the paragraph style family
 */
public StyleWithProperties getDefaultParStyle() {
    Object style = par.getDefaultStyle();
    return (StyleWithProperties) style;
}
/** Get the collection of all section styles
 * @return the <code>OfficeStyleFamily</code> of section styles
 */
public OfficeStyleFamily getSectionStyles() {
    return section;
}

/** Get a specific section style
 * @param sName the name of the section style
 * @return the result of looking up the name in the section style family
 */
public StyleWithProperties getSectionStyle(String sName) {
    Object style = section.getStyle(sName);
    return (StyleWithProperties) style;
}
/** Get the collection of all table styles
 * @return the <code>OfficeStyleFamily</code> of table styles
 */
public OfficeStyleFamily getTableStyles() {
    return table;
}

/** Get a specific table style
 * @param sName the name of the table style
 * @return the result of looking up the name in the table style family
 */
public StyleWithProperties getTableStyle(String sName) {
    Object style = table.getStyle(sName);
    return (StyleWithProperties) style;
}
/** Get the collection of all table column styles
 * @return the <code>OfficeStyleFamily</code> of column styles
 */
public OfficeStyleFamily getColumnStyles() {
    return column;
}

/** Get a specific table column style
 * @param sName the name of the column style
 * @return the result of looking up the name in the column style family
 */
public StyleWithProperties getColumnStyle(String sName) {
    Object style = column.getStyle(sName);
    return (StyleWithProperties) style;
}
/** Get the collection of all table row styles
 * @return the <code>OfficeStyleFamily</code> of row styles
 */
public OfficeStyleFamily getRowStyles() {
    return row;
}

/** Get a specific table row style
 * @param sName the name of the row style
 * @return the result of looking up the name in the row style family
 */
public StyleWithProperties getRowStyle(String sName) {
    Object style = row.getStyle(sName);
    return (StyleWithProperties) style;
}
/** Get the collection of all table cell styles
 * @return the <code>OfficeStyleFamily</code> of cell styles
 */
public OfficeStyleFamily getCellStyles() {
    return cell;
}

/** Get a specific table cell style
 * @param sName the name of the cell style
 * @return the result of looking up the name in the cell style family
 */
public StyleWithProperties getCellStyle(String sName) {
    Object style = cell.getStyle(sName);
    return (StyleWithProperties) style;
}

/** Get the default table cell style
 * @return the default style registered in the cell style family
 */
public StyleWithProperties getDefaultCellStyle() {
    Object style = cell.getDefaultStyle();
    return (StyleWithProperties) style;
}
/** Get the collection of all frame (graphic) styles
 * @return the <code>OfficeStyleFamily</code> of frame styles
 */
public OfficeStyleFamily getFrameStyles() {
    return frame;
}

/** Get a specific frame style
 * @param sName the name of the frame style
 * @return the result of looking up the name in the frame style family
 */
public StyleWithProperties getFrameStyle(String sName) {
    Object style = frame.getStyle(sName);
    return (StyleWithProperties) style;
}

/** Get the default frame style
 * @return the default style registered in the frame style family
 */
public StyleWithProperties getDefaultFrameStyle() {
    Object style = frame.getDefaultStyle();
    return (StyleWithProperties) style;
}
/** Get the collection of all presentation styles
 * @return the <code>OfficeStyleFamily</code> of presentation styles
 */
public OfficeStyleFamily getPresentationStyles() {
    return presentation;
}

/** Get a specific presentation style
 * @param sName the name of the presentation style
 * @return the result of looking up the name in the presentation style family
 */
public StyleWithProperties getPresentationStyle(String sName) {
    Object style = presentation.getStyle(sName);
    return (StyleWithProperties) style;
}

/** Get the default presentation style
 * @return the default style registered in the presentation style family
 */
public StyleWithProperties getDefaultPresentationStyle() {
    Object style = presentation.getDefaultStyle();
    return (StyleWithProperties) style;
}
/** Get the collection of all drawing page styles
 * @return the <code>OfficeStyleFamily</code> of drawing page styles
 */
public OfficeStyleFamily getDrawingPageStyles() {
    return drawingPage;
}

/** Get a specific drawing page style
 * @param sName the name of the drawing page style
 * @return the result of looking up the name in the drawing page style family
 */
public StyleWithProperties getDrawingPageStyle(String sName) {
    Object style = drawingPage.getStyle(sName);
    return (StyleWithProperties) style;
}

/** Get the default drawing page style
 * @return the default style registered in the drawing page style family
 */
public StyleWithProperties getDefaultDrawingPageStyle() {
    Object style = drawingPage.getDefaultStyle();
    return (StyleWithProperties) style;
}
/** Get the collection of all list styles
 * @return the <code>OfficeStyleFamily</code> of list styles
 */
public OfficeStyleFamily getListStyles() {
    return list;
}

/** Get a specific list style
 * @param sName the name of the list style
 * @return the result of looking up the name in the list style family
 */
public ListStyle getListStyle(String sName) {
    Object style = list.getStyle(sName);
    return (ListStyle) style;
}
/** Get the collection of all page layouts
 * @return the <code>OfficeStyleFamily</code> of page layouts
 */
public OfficeStyleFamily getPageLayouts() {
    return pageLayout;
}

/** Get a specific page layout
 * @param sName the name of the page layout
 * @return the result of looking up the name in the page layout family
 */
public PageLayout getPageLayout(String sName) {
    Object layout = pageLayout.getStyle(sName);
    return (PageLayout) layout;
}
/** Get the collection of all master pages
 * @return the <code>OfficeStyleFamily</code> of master pages
 */
public OfficeStyleFamily getMasterPages() {
    return masterPage;
}

/** Get a specific master page
 * @param sName the name of the master page
 * @return the result of looking up the name in the master page family
 */
public MasterPage getMasterPage(String sName) {
    Object page = masterPage.getStyle(sName);
    return (MasterPage) page;
}
/** Get the outline style of the document
 * @return the <code>ListStyle</code> held as outline style
 */
public ListStyle getOutlineStyle() {
    return outline;
}

/** Get the footnotes configuration
 * @return the <code>PropertySet</code> held as footnotes configuration
 */
public PropertySet getFootnotesConfiguration() {
    return footnotes;
}

/** Get the endnotes configuration
 * @return the <code>PropertySet</code> held as endnotes configuration
 */
public PropertySet getEndnotesConfiguration() {
    return endnotes;
}
/** Returns the paragraph style associated with headings of a specific
 * level. Returns <code>null</code> if no such style is known; in
 * particular, levels outside the range 1 to 10 always give <code>null</code>.
 * <p>In principle, different styles can be used for each heading, in
 * practice the same (soft) style is used for all headings of a specific
 * level.</p>
 * @param nLevel the level of the heading
 * @return a <code>StyleWithProperties</code> object representing the style
 */
public StyleWithProperties getHeadingStyle(int nLevel) {
    if (nLevel < 1 || nLevel > 10) {
        return null;
    }
    return heading[nLevel];
}
/**
 * <p>Returns the first master page used in the document. If no master
 * page is used explicitly, the first master page found in the styles is
 * returned. Returns null if no master pages exists.</p>
 * @return a <code>MasterPage</code> object representing the master page
 */
public MasterPage getFirstMasterPage() {
    return firstMasterPage;
}
/** Return the iso language used in most paragraph styles (in a well-structured
 * document this will be the default language).
 * A paragraph style without a language of its own is counted under the
 * language of the default paragraph style, if that has one.
 * TODO: Base on content rather than style
 * @return the iso language, or null if no paragraph style contributed a language
 */
public String getMajorityLanguage() {
    // Language of the default paragraph style, used as fallback while counting
    String sFallbackLang = null;
    StyleWithProperties defaultStyle = getDefaultParStyle();
    if (defaultStyle != null) {
        sFallbackLang = defaultStyle.getProperty(XMLString.FO_LANGUAGE);
    }

    // Count how many paragraph styles use each language
    Hashtable tally = new Hashtable();
    for (Enumeration styles = getParStyles().getStylesEnumeration(); styles.hasMoreElements(); ) {
        StyleWithProperties parStyle = (StyleWithProperties) styles.nextElement();
        String sLang = parStyle.getProperty(XMLString.FO_LANGUAGE);
        if (sLang == null) {
            sLang = sFallbackLang;
        }
        if (sLang == null) {
            continue;
        }
        Integer previous = (Integer) tally.get(sLang);
        int nCount = previous == null ? 1 : previous.intValue() + 1;
        tally.put(sLang, new Integer(nCount));
    }

    // Pick the language with the highest count; on a tie the one that the
    // key enumeration delivers first wins
    String sBestLang = null;
    int nBestCount = 0;
    for (Enumeration keys = tally.keys(); keys.hasMoreElements(); ) {
        String sLang = (String) keys.nextElement();
        int nCount = ((Integer) tally.get(sLang)).intValue();
        if (nCount > nBestCount) {
            nBestCount = nCount;
            sBestLang = sLang;
        }
    }
    return sBestLang;
}
/**
 * <p>Returns a reader for a specific toc</p>
 * @param onode the <code>text:table-of-content-node</code>
 * @return the reader, or null if no reader is registered for this node
 */
public TocReader getTocReader(Element onode) {
    if (!indexes.containsKey(onode)) {
        return null;
    }
    return (TocReader) indexes.get(onode);
}
/**
 * <p>Is this style used in some toc as an index source style?</p>
 * @param sStyleName the name of the style
 * @return true if this is an index source style
 */
public boolean isIndexSourceStyle(String sStyleName) {
    return indexSourceStyles.contains(sStyleName);
}

/**
 * <p>Does this sequence name belong to a lof (list of figures)?</p>
 * @param sName the name of the sequence
 * @return true if it belongs to an index
 */
public boolean isFigureSequenceName(String sName) {
    return figureSequenceNames.contains(sName);
}

/**
 * <p>Does this sequence name belong to a lot (list of tables)?</p>
 * @param sName the name of the sequence
 * @return true if it belongs to an index
 */
public boolean isTableSequenceName(String sName) {
    return tableSequenceNames.contains(sName);
}

/**
 * <p>Add a sequence name for table captions.</p>
 * <p>OpenDocument has a very weak notion of table captions: A caption is a
 * paragraph containing a text:sequence element. Moreover, the only source
 * to identify which sequence number to use is the list(s) of tables.
 * If there's no list of tables, captions cannot be identified.
 * Thus this method lets the user add a sequence name to identify the
 * table captions.</p>
 * @param sName the name to add
 */
public void addTableSequenceName(String sName) {
    tableSequenceNames.add(sName);
}

/**
 * <p>Add a sequence name for figure captions.</p>
 * <p>OpenDocument has a very weak notion of figure captions: A caption is a
 * paragraph containing a text:sequence element. Moreover, the only source
 * to identify which sequence number to use is the list(s) of figures.
 * If there's no list of figures, captions cannot be identified.
 * Thus this method lets the user add a sequence name to identify the
 * figure captions.</p>
 * @param sName the name to add
 */
public void addFigureSequenceName(String sName) {
    figureSequenceNames.add(sName);
}

/**
 * <p>Get the sequence name associated with a paragraph</p>
 * @param par the paragraph to look up
 * @return the sequence name or null
 */
public String getSequenceName(Element par) {
    if (!sequenceNames.containsKey(par)) {
        return null;
    }
    return (String) sequenceNames.get(par);
}

/**
 * <p>Get the sequence name associated with a reference name</p>
 * @param sRefName the reference name to use
 * @return the sequence name or null
 */
public String getSequenceFromRef(String sRefName) {
    Object sequence = seqrefNames.get(sRefName);
    return (String) sequence;
}

/**
 * <p>Is there a reference to this footnote id?</p>
 * @param sId the id of the footnote
 * @return true if there is a reference
 */
public boolean hasFootnoteRefTo(String sId) {
    return footnoteRef.contains(sId);
}

/**
 * <p>Is there a reference to this endnote?</p>
 * @param sId the id of the endnote
 * @return true if there is a reference
 */
public boolean hasEndnoteRefTo(String sId) {
    return endnoteRef.contains(sId);
}

/**
 * <p>Is this reference mark contained in a heading?</p>
 * @param sName the name of the reference mark
 * @return true if so
 */
public boolean referenceMarkInHeading(String sName) {
    return referenceHeading.contains(sName);
}

/**
 * <p>Is there a reference to this reference mark?</p>
 * @param sName the name of the reference mark
 * @return true if there is a reference
 */
public boolean hasReferenceRefTo(String sName) {
    return referenceRef.contains(sName);
}

/**
 * <p>Is this bookmark contained in a heading?</p>
 * @param sName the name of the bookmark
 * @return true if so
 */
public boolean bookmarkInHeading(String sName) {
    return bookmarkHeading.contains(sName);
}

/**
 * <p>Is there a reference to this bookmark?</p>
 * @param sName the name of the bookmark
 * @return true if there is a reference
 */
public boolean hasBookmarkRefTo(String sName) {
    return bookmarkRef.contains(sName);
}

/**
 * <p>Is there a reference to this sequence field?</p>
 * @param sId the id of the sequence field
 * @return true if there is a reference
 */
public boolean hasSequenceRefTo(String sId) {
    return sequenceRef.contains(sId);
}

/**
 * <p>Is there a link to this sequence anchor name?</p>
 * @param sName the name of the anchor
 * @return true if there is a link
 */
public boolean hasLinkTo(String sName) {
    return links.contains(sName);
}

/**
 * <p>Is this an OASIS OpenDocument or an OOo 1.0 document?</p>
 * @return true if it's an OASIS OpenDocument
 */
public boolean isOpenDocument() {
    return bOpenDocument;
}

/**
 * <p>Is this a text document?</p>
 * @return true if it's a text document
 */
public boolean isText() {
    return bText;
}

/**
 * <p>Is this a spreadsheet document?</p>
 * @return true if it's a spreadsheet document
 */
public boolean isSpreadsheet() {
    return bSpreadsheet;
}

/**
 * <p>Is this a presentation document?</p>
 * @return true if it's a presentation document
 */
public boolean isPresentation() {
    return bPresentation;
}

/**
 * <p>Get the content element</p>
 * <p>In the old file format this means the <code>office:body</code> element</p>
 * <p>In the OpenDocument format this means a <code>office:text</code>,
 * <code>office:spreadsheet</code> or <code>office:presentation</code>
 * element.</p>
 * @return the content <code>Element</code>
 */
public Element getContent() {
    return content;
}
/** Get the forms belonging to this document.
 * @return a <code>FormsReader</code> representing the forms
 */
public FormsReader getForms() {
    return forms;
}
/** Read a table from a table:table node.
 * A new reader is constructed on every call.
 * @param node the table:table Element node
 * @return a <code>TableReader</code> object representing the table
 */
public TableReader getTableReader(Element node) {
    TableReader reader = new TableReader(this, node);
    return reader;
}
/** Constructor; read a document */
public OfficeReader(OfficeDocument oooDoc, boolean bAllParagraphsAreSoft) {
this.oooDoc = oooDoc;
loadStylesFromDOM(oooDoc.getStyleDOM(),oooDoc.getContentDOM(),bAllParagraphsAreSoft);
loadContentFromDOM(oooDoc.getContentDOM());
}
///////////////////////////////////////////////////////////////////////////
// Helpers
/*private void collectMasterPage(StyleWithProperties style) {
if (style==null || firstMasterPage!=null) { return; }
String s = style.getMasterPageName();
if (s!=null && s.length()>0) {
firstMasterPage = getMasterPage(s);
}
}*/
/** Load all styles found among the element children of a style container
 * and dispatch each one to the matching style family.
 * @param node the container; should be office:master-styles, office:styles
 * or office:automatic-styles
 * @param bAllParagraphsAreSoft if true, paragraph styles are loaded with
 * the automatic flag cleared even when the container is office:automatic-styles
 */
private void loadStylesFromDOM(Node node, boolean bAllParagraphsAreSoft) {
    // Styles are flagged as automatic only when they come from office:automatic-styles
    boolean bAutomatic = XMLString.OFFICE_AUTOMATIC_STYLES.equals(node.getNodeName());
    if (!node.hasChildNodes()) {
        return;
    }

    NodeList children = node.getChildNodes();
    int nCount = children.getLength();
    for (int i = 0; i < nCount; i++) {
        Node child = children.item(i);
        if (child.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        String sTag = child.getNodeName();

        if (sTag.equals(XMLString.STYLE_STYLE)) {
            // Ordinary styles are dispatched on their family attribute
            String sFamily = Misc.getAttribute(child, XMLString.STYLE_FAMILY);
            if ("text".equals(sFamily)) {
                text.loadStyleFromDOM(child, bAutomatic);
            }
            else if ("paragraph".equals(sFamily)) {
                par.loadStyleFromDOM(child, bAutomatic && !bAllParagraphsAreSoft);
            }
            else if ("section".equals(sFamily)) {
                section.loadStyleFromDOM(child, bAutomatic);
            }
            else if ("table".equals(sFamily)) {
                table.loadStyleFromDOM(child, bAutomatic);
            }
            else if ("table-column".equals(sFamily)) {
                column.loadStyleFromDOM(child, bAutomatic);
            }
            else if ("table-row".equals(sFamily)) {
                row.loadStyleFromDOM(child, bAutomatic);
            }
            else if ("table-cell".equals(sFamily)) {
                cell.loadStyleFromDOM(child, bAutomatic);
            }
            else if ("graphics".equals(sFamily) || "graphic".equals(sFamily)) { // oasis: no s
                frame.loadStyleFromDOM(child, bAutomatic);
            }
            else if ("presentation".equals(sFamily)) {
                presentation.loadStyleFromDOM(child, bAutomatic);
            }
            else if ("drawing-page".equals(sFamily)) {
                // Bug in OOo 1.x: The same name may be used for a real and an automatic style...
                // so only the first style loaded under a given name is kept
                String sStyleName = Misc.getAttribute(child, XMLString.STYLE_NAME);
                if (drawingPage.getStyle(sStyleName) == null) {
                    drawingPage.loadStyleFromDOM(child, bAutomatic);
                }
            }
        }
        else if (sTag.equals(XMLString.STYLE_PAGE_MASTER)        // old
              || sTag.equals(XMLString.STYLE_PAGE_LAYOUT)) {     // oasis
            pageLayout.loadStyleFromDOM(child, bAutomatic);
        }
        else if (sTag.equals(XMLString.STYLE_MASTER_PAGE)) {
            masterPage.loadStyleFromDOM(child, bAutomatic);
            // Remember the first master page encountered
            if (firstMasterPage == null) {
                String sPageName = Misc.getAttribute(child, XMLString.STYLE_NAME);
                firstMasterPage = (MasterPage) masterPage.getStyle(sPageName);
            }
        }
        else if (sTag.equals(XMLString.TEXT_LIST_STYLE)) {
            list.loadStyleFromDOM(child, bAutomatic);
        }
        else if (sTag.equals(XMLString.TEXT_OUTLINE_STYLE)) {
            outline.loadStyleFromDOM(child);
        }
        else if (sTag.equals(XMLString.STYLE_DEFAULT_STYLE)) {
            // Default styles are kept for the paragraph, frame and cell families only
            String sFamily = Misc.getAttribute(child, XMLString.STYLE_FAMILY);
            if ("paragraph".equals(sFamily)) {
                StyleWithProperties defaultPar = new StyleWithProperties();
                defaultPar.loadStyleFromDOM(child);
                par.setDefaultStyle(defaultPar);
            }
            else if ("graphics".equals(sFamily) || "graphic".equals(sFamily)) { // oasis: no s
                StyleWithProperties defaultFrame = new StyleWithProperties();
                defaultFrame.loadStyleFromDOM(child);
                frame.setDefaultStyle(defaultFrame);
            }
            else if ("table-cell".equals(sFamily)) {
                StyleWithProperties defaultCell = new StyleWithProperties();
                defaultCell.loadStyleFromDOM(child);
                cell.setDefaultStyle(defaultCell);
            }
        }
    }
}
private void loadStylesFromDOM(Document stylesDOM, Document contentDOM, boolean bAllParagraphsAreSoft){
// Flat xml: stylesDOM will be null and contentDOM contain everything
// This is only the case for old versions of xmerge; newer versions
// creates DOM for styles, content, meta and settings.
NodeList list;
// font declarations: Try old format first
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.OFFICE_FONT_DECLS);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_FONT_DECLS);
}
// If that fails, try oasis format
if (list.getLength()==0) {
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.OFFICE_FONT_FACE_DECLS);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_FONT_FACE_DECLS);
}
}
if (list.getLength()!=0) {
Node node = list.item(0);
if (node.hasChildNodes()){
NodeList nl = node.getChildNodes();
int nLen = nl.getLength();
for (int i = 0; i < nLen; i++ ) {
Node child = nl.item(i);
if (child.getNodeType()==Node.ELEMENT_NODE){
if (child.getNodeName().equals(XMLString.STYLE_FONT_DECL)){
font.loadStyleFromDOM(child,false);
}
else if (child.getNodeName().equals(XMLString.STYLE_FONT_FACE)){
font.loadStyleFromDOM(child,false);
}
}
}
}
}
// soft formatting:
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.OFFICE_STYLES);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_STYLES);
}
if (list.getLength()!=0) {
loadStylesFromDOM(list.item(0),bAllParagraphsAreSoft);
}
// master styles:
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.OFFICE_MASTER_STYLES);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_MASTER_STYLES);
}
if (list.getLength()!=0) {
loadStylesFromDOM(list.item(0),bAllParagraphsAreSoft);
}
// hard formatting:
// Load from styles.xml first. Problem: There may be name clashes
// with automatic styles from content.xml
if (stylesDOM!=null) {
list = stylesDOM.getElementsByTagName(XMLString.OFFICE_AUTOMATIC_STYLES);
if (list.getLength()!=0) {
loadStylesFromDOM(list.item(0),bAllParagraphsAreSoft);
}
}
list = contentDOM.getElementsByTagName(XMLString.OFFICE_AUTOMATIC_STYLES);
if (list.getLength()!=0) {
loadStylesFromDOM(list.item(0),bAllParagraphsAreSoft);
}
// footnotes configuration:
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.TEXT_FOOTNOTES_CONFIGURATION);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.TEXT_FOOTNOTES_CONFIGURATION);
}
if (list.getLength()!=0) {
footnotes = new PropertySet();
footnotes.loadFromDOM(list.item(0));
}
// endnotes configuration:
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.TEXT_ENDNOTES_CONFIGURATION);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.TEXT_ENDNOTES_CONFIGURATION);
}
if (list.getLength()!=0) {
endnotes = new PropertySet();
endnotes.loadFromDOM(list.item(0));
}
// if it failed, try oasis format
if (footnotes==null || endnotes==null) {
if (stylesDOM==null) {
list = contentDOM.getElementsByTagName(XMLString.TEXT_NOTES_CONFIGURATION);
}
else {
list = stylesDOM.getElementsByTagName(XMLString.TEXT_NOTES_CONFIGURATION);
}
int nLen = list.getLength();
for (int i=0; i