Source code for abbyy_to_epub3.parse_abbyy

# Copyright 2017 Deborah Kaplan
#
# This file is part of Abbyy-to-epub3.
# Source code is available at <https://github.com/deborahgu/abbyy-to-epub3>.
#
# Abbyy-to-epub3 is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from ebooklib import epub
from ebooklib import utils as ebooklibutils
from lxml import etree

import logging
import sys

from abbyy_to_epub3 import constants
from abbyy_to_epub3.utils import sanitize_xml


def gettext(elem):
    """
    Return the concatenated text content of an element and its descendants.

    The element's own leading text is used as-is. Each child contributes its
    own (recursively gathered) text, followed by the child's tail text with
    surrounding whitespace stripped. The tail of `elem` itself is not
    included.
    """
    fragments = [elem.text or ""]
    for child in elem:
        fragments.append(gettext(child))
        trailing = child.tail
        if trailing:
            # Tails are stripped, so inter-element whitespace is dropped.
            fragments.append(trailing.strip())
    return "".join(fragments)
def add_last_text(blocks, page):
    """
    Given a list of blocks and the page number of the last page in the list,
    mark up the last text block for that page in the list, if it exists.

    The list is scanned backwards from its end. Scanning stops, without
    marking anything, as soon as a block is met that either has no
    'page_no' key (a page-number marker, i.e. the end of the previous page)
    or belongs to a different page. The first text block found before that
    point gets ``block['last'] = True``.

    :param blocks: list of block dicts; modified in place
    :param page: page number whose last text block should be flagged
    :return: None
    """
    # The parser in this module emits 'Text'; the lower-case spelling is
    # accepted as well so callers that pass 'text' keep working.
    text_types = ('Text', 'text')

    # Iterating in reverse replaces the earlier recursion over list slices,
    # which copied the list at every step and could hit the recursion limit
    # on pages with many non-text blocks. It also copes with an empty list.
    for block in reversed(blocks):
        if 'page_no' not in block or block['page_no'] != page:
            # Reached a page marker or a block from another page.
            return
        if block.get('type') in text_types:
            block['last'] = True
            return
class AbbyyParser(object):
    """
    The ABBYY parser object.
    Parses ABBYY metadata in preparation for import into an EPUB 3 document.

    Here are the components of the ABBYY schema we use:

    .. code:: html

        <page>
            <block>types Picture, Separator, Table, or Text</block>

    Text:

    .. code:: html

        <page>
        <region>
        <text> contains a '\\n' as a text element
        <par> The paragraph, repeatable
        <line> The line, repeatable
        <formatting>
        <charParams>: The individual character

    Image:

    Separator:

    Table:

    .. code:: html

        <row>
        <cell>
        <text>
        <par>

    Each paragraph has an identifier, which has a unique style, including
    the paragraph's role, eg:

    .. code:: html

        <paragraphStyle
            id="{000000DD-016F-0A36-032F-EEBBD9B8571E}"
            name="Heading #1|1"
            mainFontStyleId="{000000DE-016F-0A37-032F-176E5F6405F5}"
            role="heading" roleLevel="1" align="Right"
            startIndent="0" leftIndent="0" rightIndent="0"
            lineSpacing="1790" fixedLineSpacing="1">
        <par align="Right" lineSpacing="1790"
            style="{000000DD-016F-0A36-032F-EEBBD9B8571E}">

    The roles map as follows:

    ================= ==============
    Role name         role
    ================= ==============
    Body text         text
    Footnote          footnote
    Header or footer  rt
    Heading           heading
    Other             other
    Table caption     tableCaption
    Table of contents contents
    ================= ==============
    """

    # Set these once we start parsing the tree and know our schema
    ns = ''
    nsm = ''
    version = ''
    etree = ''

    def __init__(self, document, metadata_file, metadata, paragraphs,
                 blocks, debug=False):
        """
        Store the inputs and the containers the parser fills in.

        :param document: ABBYY XML source, handed to ``etree.parse``
        :param metadata_file: metadata XML source, handed to ``etree.parse``
        :param metadata: dict that receives the parsed metadata
        :param paragraphs: dict that receives paragraph styles keyed by id
        :param blocks: list that receives the parsed content blocks
        :param debug: when true, log at DEBUG level to a stream handler
        """
        self.logger = logging.getLogger(__name__)
        if debug:
            # The module logger is shared by every parser instance, so only
            # attach a console handler if one is not already there;
            # otherwise each new parser would duplicate every log line.
            has_console = any(
                type(handler) is logging.StreamHandler
                for handler in self.logger.handlers
            )
            if not has_console:
                self.logger.addHandler(logging.StreamHandler())
            self.logger.setLevel(logging.DEBUG)

        self.document = document
        self.metadata_file = metadata_file
        self.metadata = metadata
        self.paragraphs = paragraphs
        self.blocks = blocks

        # Save page numbers only if using a supporting version of ebooklib
        self.PAGES_SUPPORT = 'create_pagebreak' in dir(ebooklibutils)

    def is_block_type(self, elem, blocktype):
        """
        Identify whether an XML element is a block of the given type.

        :param elem: element to test
        :param blocktype: expected value of the ``blockType`` attribute,
            e.g. "Text", "Table" or "Picture"
        :return: True if `elem` is a ``block`` element in the parser's
            namespace whose ``blockType`` equals `blocktype`, else False
        """
        return (
            elem.tag == "{{{}}}block".format(self.ns)
            and elem.get("blockType") == blocktype
        )

    def parse_abbyy(self):
        """
        Read the ABBYY file into an lxml etree and parse all of it.

        Detects the schema from the root element's namespace map, records
        the detected version in the metadata, then parses the metadata
        file, the paragraph styles and the page content, in that order.

        :raises RuntimeError: if the document uses neither known namespace
        """
        self.tree = etree.parse(self.document)

        # We can parse FR6 schema, a little
        declared_namespaces = self.tree.getroot().nsmap.values()
        if constants.ABBYY_NS in declared_namespaces:
            self.nsm = constants.ABBYY_NSM
            self.ns = constants.ABBYY_NS
            self.version = "FR10"
        elif constants.OLD_NS in declared_namespaces:
            self.nsm = constants.OLD_NSM
            self.ns = constants.OLD_NS
            self.version = "FR6"
        else:
            raise RuntimeError("Input XML document is not a supported schema.")

        self.logger.debug("Version %s", self.version)
        self.metadata['fr-version'] = self.version
        self.metadata['pics_by_page'] = dict()

        self.parse_metadata()
        self.parse_paragraph_styles()
        self.parse_content()

    def parse_paragraph_styles(self):
        """
        Paragraph styles are on their own at the start of the ABBYY.

        Copies the attributes of every ``paragraphStyle`` element into
        ``self.paragraphs`` keyed by the style id. When the style names a
        ``mainFontStyleId`` that matches a ``fontStyle`` element, that
        font style's attributes are stored under the 'fontstyle' key.
        """
        styles = self.tree.findall(".//a:paragraphStyle", namespaces=self.nsm)
        fontstyles = self.tree.findall(".//a:fontStyle", namespaces=self.nsm)

        # Index the font styles once instead of rescanning them for every
        # paragraph style. setdefault keeps the first element per id, which
        # is the one a linear search would have found.
        fonts_by_id = {}
        for fstyle in fontstyles:
            fonts_by_id.setdefault(fstyle.get("id"), fstyle)

        for style in styles:
            style_id = style.get("id")
            self.paragraphs[style_id] = dict(style.attrib)
            if 'mainFontStyleId' not in style.attrib:
                continue
            font = fonts_by_id.get(style.attrib['mainFontStyleId'])
            # Compare against None: element truthiness depends on children.
            if font is not None:
                self.paragraphs[style_id]['fontstyle'] = dict(font.attrib)

    def parse_content(self):
        """
        Parse each page of the book.

        The first ``page`` element is treated as a calibration page and
        skipped. Every other page's blocks are appended to ``self.blocks``
        as dicts. After a non-empty page, the last text block of that page
        is flagged and, if the installed ebooklib supports page breaks, a
        ``{'type': 'Page', 'text': page_no}`` marker is appended. Pages
        without children only advance the page counter.
        """
        pages = self.tree.findall(".//a:page", namespaces=self.nsm)
        page_no = 1

        # Slicing (rather than pop(0)) ignores the calibration page and
        # also tolerates a document that contains no pages at all.
        for page in pages[1:]:
            # list(elem) is the supported replacement for the deprecated
            # getchildren().
            block_per_page = list(page)
            if not block_per_page:
                page_no += 1
                continue

            pagewidth = page.get('width')
            pageheight = page.get('height')
            newpage = True

            for block in block_per_page:
                # The page size is written onto the block's own attributes
                # so it travels with the block's style.
                blockattr = block.attrib
                blockattr['pagewidth'] = pagewidth
                blockattr['pageheight'] = pageheight

                if self.is_block_type(block, "Text"):
                    newpage = self._parse_text_block(block, page_no, newpage)
                elif self.is_block_type(block, "Table"):
                    self._parse_table_block(block, blockattr, page_no)
                    # Text following a table is not first on the page.
                    newpage = False
                else:
                    self._parse_other_block(block, blockattr, page_no)

            # Mark up the last text block on the page, if there is one
            add_last_text(self.blocks, page_no)

            # For accessibility, create a page number at the end of every page
            if self.PAGES_SUPPORT:
                self.blocks.append({
                    'type': 'Page',
                    'text': page_no,
                })

            # Set up the next iteration.
            page_no += 1

    def _parse_text_block(self, block, page_no, newpage):
        """
        Append one 'Text' block per non-empty paragraph of a text block.

        Some blocks can have multiple styles in them, so each paragraph is
        treated as its own block.

        :param block: the ``block`` element of type Text
        :param page_no: number of the page the block is on
        :param newpage: True while no text has been emitted for this page
        :return: the updated `newpage` flag
        """
        for para in block.findall(".//a:par", namespaces=self.nsm):
            # Get the paragraph style and text
            para_id = para.get("style")
            if para_id not in self.paragraphs:
                self.logger.info(
                    'The block with the ID %s has no corresponding '
                    'paragraphStyle',
                    para_id,
                )
                self.paragraphs[para_id] = dict()
            style = self.paragraphs[para_id]

            # Ignore whitespace-only pars
            text = gettext(para).strip()
            if not text:
                continue

            # Get the paragraph role.
            # FR6 docs have no structure, styles, roles. A paragraph whose
            # style is unknown (registered above as an empty dict) has no
            # role either, so it gets the same marker instead of raising
            # KeyError.
            if self.version == "FR10":
                role = style.get('role', "FR6")
            else:
                role = "FR6"

            # Skip headers and footers
            if role == 'rt':
                continue

            # This is a good text chunk. Instantiate the block.
            entry = {
                'type': 'Text',
                'page_no': page_no,
                'text': sanitize_xml(text),
                'role': role,
                'style': style,
            }

            # To help with unmarked header recognition
            if newpage:
                entry['first'] = True
                newpage = False

            # Mark up heading level:
            # shortcut so we need fewer lookups later
            if role == 'heading':
                entry['heading'] = style['roleLevel']

            self.blocks.append(entry)
        return newpage

    def _parse_table_block(self, block, blockattr, page_no):
        """
        Append the blocks describing a table, its rows, cells and text.

        We process the table by treating each of its cells' subordinate
        blocks as separate. The last element in a cell/row/table carries
        ``last_table_elem`` so the elements can be closed once each is
        complete.

        :param block: the ``block`` element of type Table
        :param blockattr: the block's attributes, stored as each entry's style
        :param page_no: number of the page the block is on
        """
        def entry(kind):
            return {
                'type': kind,
                'style': blockattr,
                'page_no': page_no,
            }

        self.blocks.append(entry('Table'))

        rows = block.findall(".//a:row", namespaces=self.nsm)
        for row_index, row in enumerate(rows, start=1):
            row_entry = entry('TableRow')
            if row_index == len(rows):
                row_entry['last_table_elem'] = True
            self.blocks.append(row_entry)

            cells = row.findall("a:cell", namespaces=self.nsm)
            for cell_index, cell in enumerate(cells, start=1):
                cell_entry = entry('TableCell')
                if cell_index == len(cells):
                    cell_entry['last_table_elem'] = True
                self.blocks.append(cell_entry)

                # Parsing a cell is not quite like parsing regular text.
                # The layout is cell -> text -> par.
                text_elem = cell.find("a:text", namespaces=self.nsm)
                if text_elem is None:
                    # A cell without a <text> child has no paragraphs.
                    continue
                paras = text_elem.findall("a:par", namespaces=self.nsm)
                texts = [gettext(para).strip() for para in paras]

                # Ignore whitespace-only paras unless it's an empty cell
                # (a single blank paragraph). If so, keep it as placeholder.
                if len(paras) > 1:
                    texts = [text for text in texts if text]

                # Number the paragraphs that are actually emitted, so the
                # final one is flagged even when blank ones were dropped.
                for text_index, text in enumerate(texts, start=1):
                    text_entry = entry('TableText')
                    text_entry['text'] = sanitize_xml(text)
                    if text_index == len(texts):
                        text_entry['last_table_elem'] = True
                    self.blocks.append(text_entry)

    def _parse_other_block(self, block, blockattr, page_no):
        """
        Append an entry for a non-text, non-table block.

        The entry records the block's type and attributes. Pictures are
        additionally collected in ``metadata['pics_by_page']``, a dict of
        lists keyed by page number, so overlapping images can be stripped
        out later.

        :param block: the ``block`` element
        :param blockattr: the block's attributes, stored as the entry's style
        :param page_no: number of the page the block is on
        """
        entry = {
            'type': block.get("blockType"),
            'style': blockattr,
            'page_no': page_no,
        }
        self.blocks.append(entry)

        if self.is_block_type(block, "Picture"):
            # Index by page number; assigning to 'pics_by_page' itself
            # would throw away every previously recorded picture.
            pics_by_page = self.metadata.setdefault('pics_by_page', dict())
            pics_by_page.setdefault(page_no, []).append(entry)

    def parse_metadata(self):
        """
        Parse out the metadata from the _meta.xml file.

        Every direct child of the metadata file's root element contributes
        its text to ``self.metadata``, as a list of values under the
        child's tag.
        """
        tree = etree.parse(self.metadata_file)
        root = tree.getroot()
        for term in root.iterchildren():
            self.metadata.setdefault(term.tag, []).append(term.text)