Source code for abbyy_to_epub3.parse_abbyy

# Copyright 2017 Deborah Kaplan
#
# This file is part of Abbyy-to-epub3.
# Source code is available at <https://github.com/deborahgu/abbyy-to-epub3>.
#
# Abbyy-to-epub3 is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from ebooklib import utils as ebooklibutils
from lxml import etree

import gc
import logging
import pycountry
import re

from abbyy_to_epub3 import constants
from abbyy_to_epub3.utils import fast_iter, gettext, sanitize_xml


def add_last_text(blocks, page):
    """
    Mark the last text block belonging to `page` as the page's last text.

    Walks `blocks` from the end towards the start and sets ``'last': True``
    on the first text block found whose ``page_no`` equals `page`. The list
    itself is never shortened or reordered; only the matching block dict is
    mutated.

    The walk stops without marking anything when it meets:

    * a block without a ``page_no`` key (e.g. a ``Page`` marker block), or
    * a block from an earlier page (``page_no < page``), meaning the page
      contained no text block.

    :param blocks: list of block dicts, in document order
    :param page: page number whose last text block should be marked
    :return: None
    """
    # Iterate in reverse rather than repeatedly slicing the list, which
    # copied the whole list on every step.
    for elem in reversed(blocks):
        # Look for a page number in the block; without one we can't tell
        # which page we're on, so give up.
        if 'page_no' not in elem:
            return

        page_no = elem['page_no']

        # We reached the previous page without hitting text.
        # NOTE: this must be a strict comparison. With `<=` the function
        # returned before the equality test below could ever run.
        if page_no < page:
            return

        # Text blocks in this module are created with type 'Text'; the
        # lowercase spelling is accepted as well for older callers.
        if page_no == page and elem.get('type') in ('Text', 'text'):
            elem['last'] = True
            return
class AbbyyParser(object):
    """
    The ABBYY parser object.
    Parses ABBYY metadata in preparation for import into an EPUB 3 document.

    An ABBYY document begins with font and style information:

    .. code:: html

        <documentData>
            <paragraphStyles>
                <paragraphStyle
                    id="{idnum}" name="stylename"
                    mainFontStyleId="{idnum}" [style info]>
                <fontStyle id="{idnum}" [style info]>
            </paragraphStyle>
            [more styles]
        </documentData>

    This is followed by the data for the pages.

    .. code:: html

        <page>
            <block></block>
            [more blocks]
        </page>

    Blocks have types. We process types Text, Picture, and Table.

    Text:

    .. code:: html

        <page>
            <region>
            <text> contains a '\\n' as a text element
            <par> The paragraph, repeatable
                <line> The line, repeatable
                    <formatting>
                        <charParams>: The individual character

    Picture: we know the corresponding scan (page) number, & coordinates.

    Table:

    .. code:: html

        <row>
            <cell>
                <text>
                    <par>

    Each `<par>` has an identifier, which has a unique style, including
    the paragraph's role, eg:

    .. code:: html

        <par align="Right" lineSpacing="1790"
         style="{000000DD-016F-0A36-032F-EEBBD9B8571E}">

    This corresponds to a paragraphStyle from the `<documentData>` element:

    .. code:: html

        <paragraphStyle
         id="{000000DD-016F-0A36-032F-EEBBD9B8571E}" name="Heading #1|1"
         mainFontStyleId="{000000DE-016F-0A37-032F-176E5F6405F5}"
         role="heading" roleLevel="1" [style information]>

    The roles map as follows:

    ================= ==============
    Role name         role
    ================= ==============
    Body text         text
    Footnote          footnote
    Header or footer  rt
    Heading           heading
    Other             other
    Table caption     tableCaption
    Table of contents contents
    ================= ==============

    """

    # Set these once we start parsing the tree and know our schema.
    # (`etree` here is only a class attribute; the methods below use the
    # bare name `etree`, which resolves to the module-level lxml import.)
    ns = ''
    nsm = ''
    version = ''
    etree = ''

    def __init__(
            self, document, metadata_file, metadata, paragraphs, blocks,
            debug=False
    ):
        """
        Store the inputs and the caller-owned output containers.

        :param document: the ABBYY XML, in any form `etree.iterparse` accepts
        :param metadata_file: the metadata XML, passed to `etree.parse`
        :param metadata: dict that is filled in with parsed metadata
        :param paragraphs: dict that is filled in with paragraph styles,
            keyed by style id
        :param blocks: list that is filled in with the parsed blocks
        :param debug: when true, log at DEBUG level to a stream handler
        """
        self.logger = logging.getLogger(__name__)
        if debug:
            self.logger.addHandler(logging.StreamHandler())
            self.logger.setLevel(logging.DEBUG)

        self.document = document
        self.metadata_file = metadata_file
        self.metadata = metadata
        self.paragraphs = paragraphs
        self.blocks = blocks
        self.page_no = 0

        # Per-parse state. `parse_abbyy` and `process_pages` reset these;
        # they are defined here so every attribute exists from construction.
        self.fontStyles = dict()
        self.pages = []
        self.pagewidth = None
        self.pageheight = None
        self.newpage = False

        # Save page numbers only if using a supporting version of ebooklib
        self.metadata['PAGES_SUPPORT'] = hasattr(
            ebooklibutils, 'create_pagebreak'
        )

    def is_block_type(self, blockattr, blocktype):
        """
        Identifies if a block has the given type.

        :param blockattr: mapping of the block's attributes
        :param blocktype: the `blockType` value to test for
        :return: True if `blockattr` has a `blockType` equal to `blocktype`
        """
        return (
            'blockType' in blockattr and
            blockattr['blockType'] == blocktype
        )

    def find_namespace(self):
        """
        Find the namespace of an XML document.

        Assumes that the namespace of the first element in the context is the
        namespace we need. This is more memory-efficient than parsing the
        entire tree to get the root node.

        Sets `self.ns`, `self.nsm`, `self.version` and
        `self.metadata['fr-version']`. Does nothing if the version is
        already known.

        :raises RuntimeError: if the document uses neither supported schema
        """
        if self.version:
            return

        context = etree.iterparse(self.document, events=('start',),)
        for _event, elem in context:
            # Namespace depends on finereader version.
            # We can parse FR6 schema, a little
            declared = elem.nsmap.values()
            if constants.ABBYY_NS in declared:
                self.nsm = constants.ABBYY_NSM
                self.ns = constants.ABBYY_NS
                self.version = "FR10"
            elif constants.OLD_NS in declared:
                self.nsm = constants.OLD_NSM
                self.ns = constants.OLD_NS
                self.version = "FR6"
            else:
                raise RuntimeError("Input XML not in a supported schema.")

            self.logger.debug("FineReader Version {}".format(self.version))
            self.metadata['fr-version'] = self.version
            # Only the first element is needed.
            return

    def parse_metadata(self):
        """
        Parse out the metadata from the _meta.xml file.

        Every child of the root element is stored in `self.metadata` as a
        list of text values keyed by tag name. The first `language` entry is
        then normalised to a language code, falling back to English.
        """
        tree = etree.parse(self.metadata_file)
        root = tree.getroot()
        for term in root.iterchildren():
            self.metadata.setdefault(term.tag, []).append(term.text)

        # if the language isn't explicitly set, assume English
        # if the language code is invalid, assume English
        # language might be ISO 639-6, ISO 639-2/B, ISO 639-2/T, or ISO 639-1
        # (in pycountry, called: name, alpha_3, bibliographic, and alpha_2)
        if 'language' not in self.metadata:
            self.metadata['language'] = ['en']
            return

        lang_code = self.metadata['language'][0]
        try:
            lang = pycountry.languages.lookup(lang_code)
        except LookupError:
            self.logger.debug(
                "Invalid language code {}. Setting to English".format(
                    lang_code
                )
            )
            self.metadata['language'][0] = 'en'
        else:
            # Prefer the two-letter code, else the three-letter one.
            self.metadata['language'][0] = (
                getattr(lang, 'alpha_2', None) or
                getattr(lang, 'alpha_3', None)
            )

    def parse_abbyy(self):
        """
        Parse the ABBYY into a format useful for `create_epub`.

        Process the elements we will need to construct the EPUB:
        `paragraphStyle`, `fontStyle`, and `page`. We traverse the entire tree
        twice with `iterparse`, because lxml builds the whole node tree in
        memory for even tag-selective `iterparse`, & if we don't traverse the
        whole tree, we can't delete the unowned nodes. `fast_iter` makes the
        process speedy, and the dual processing saves on memory. Because of
        the layout of the elements in the ABBYY file, it's too complex to
        do this in a single iterative pass.

        Results are written to `self.paragraphs`, `self.blocks` and
        `self.metadata`.
        """
        # some basic initialization
        self.metadata['pics_by_page'] = dict()
        self.fontStyles = dict()
        self.pages = []

        # Be aggressive with garbage collection; parsing the XML hogs memory
        gc.set_threshold(1, 1, 1)

        # Get the namespace & the FR version, so we can find the other
        # elements
        self.find_namespace()

        self.logger.debug("Beginning iterparse")

        # paragraphStyle is a prerequisite for page
        context = etree.iterparse(
            self.document,
            events=('end',),
        )
        self.logger.debug("fast_iter on process_styles")
        fast_iter(context, self.process_styles)
        del context

        # Because of the processing order of XML events, it's efficient
        # to collect para and font styles upfront & collate it after.
        # Each paragraph style names its font style through its own
        # `mainFontStyleId` attribute; look that *value* up in fontStyles.
        for attribs in self.paragraphs.values():
            font_id = attribs.get('mainFontStyleId')
            if font_id is not None and font_id in self.fontStyles:
                attribs['fontstyle'] = self.fontStyles[font_id]

        # parse the metadata document next
        self.logger.debug("parse_metadata")
        self.parse_metadata()

        # finally, extract the individual page elements from the XML
        self.logger.debug("Beginning iterparse on pages")
        context = etree.iterparse(
            self.document,
            events=('end',),
            tag="{{{}}}page".format(self.ns),
        )
        self.logger.debug("fast_iter on process_pages")
        fast_iter(context, self.process_pages)
        del context

        # if we don't clear the list, the page elements will stick around
        # even after the list's scope has vanished, leaking memory
        self.pages.clear()

    def process_styles(self, elem):
        """
        Iteratively parse styles from the ABBYY file into data structures.

        The ABBYY seems to be sometimes inconsistent about whether these
        elements have a namespace, so be forgiving.

        :param elem: an element from the ABBYY document; anything other than
            a `paragraphStyle` is ignored
        """
        namespaced = "{{{}}}paragraphStyle".format(self.ns)
        if elem.tag != namespaced and elem.tag != "paragraphStyle":
            return

        # Paragraph styles are on their own at the start of the ABBYY
        # and contain child fontStyle elements
        self.paragraphs[elem.get("id")] = dict(elem.attrib)
        for fontstyle in elem.iterchildren():
            self.fontStyles[fontstyle.get("id")] = dict(fontstyle.attrib)

    def process_pages(self, elem):
        """
        Iteratively process pages from the ABBYY file. We have to process
        now rather than copying the pages for later processing, because
        deepcopying an lxml element replicates the entire tree.

        The ABBYY seems to be sometimes inconsistent about whether these
        elements have a namespace, so be forgiving.

        :param elem: an element from the ABBYY document; anything other than
            a `page` is ignored
        """
        namespaced = "{{{}}}page".format(self.ns)
        if elem.tag != namespaced and elem.tag != "page":
            return

        self.pagewidth = elem.get('width')
        self.pageheight = elem.get('height')

        # A page without children has nothing to parse. Test the child
        # count: the iterator returned by `iterchildren()` does not say
        # whether there are any children, so it cannot serve as the guard.
        if len(elem) == 0:
            self.page_no += 1
            return

        self.newpage = True

        # Most pages have multiple `<block>` elements
        for block in elem.iterchildren():
            self.parse_block(block)

        # Mark up the last text block on the page, if there is one
        add_last_text(self.blocks, self.page_no)

        # For accessibility, create a page number at the end of every page
        # with content.
        if (
            self.metadata['PAGES_SUPPORT'] and
            len(self.blocks) > 1 and
            self.blocks[-1]['type'] != 'Page'
        ):
            self.blocks.append({
                'type': 'Page',
                'text': self.page_no,
            })

        # Set up the next iteration.
        self.page_no += 1

    def parse_block(self, block):
        """
        Parse a single block on the page.

        The block's attributes are extended with the current page's width
        and height, then the block is dispatched on its `blockType`:
        Text, Table, or anything else (including Picture). Resulting
        entries are appended to `self.blocks`.

        :param block: a `<block>` element of the current page
        """
        blockattr = block.attrib
        blockattr['pagewidth'] = self.pagewidth
        blockattr['pageheight'] = self.pageheight

        if self.is_block_type(blockattr, "Text"):
            self._parse_text_block(block)
        elif self.is_block_type(blockattr, "Table"):
            self._parse_table_block(block, blockattr)
        else:
            self._parse_other_block(block, blockattr)

    def _join_lines(self, para):
        """
        Return the text of a `<par>`, joined from its `<line>` elements.

        Line breaks are preserved up to this point so we can strip EOL
        hyphens and pad whitespace at line endings. Each line element is
        cleared once it has been read.
        """
        pieces = []
        lines = para.iterdescendants(tag="{{{}}}line".format(self.ns))
        for line in lines:
            linetext = gettext(line).strip()
            # replace EOL hyphens with whitespace stripping
            # add whitespace at end of all other lines
            text_clean = sanitize_xml(linetext)
            text_pad = re.sub(r'([^¬-])\Z', r'\1 ', text_clean)
            pieces.append(re.sub(r'[¬-]\s*\Z', r'', text_pad))
            line.clear()
        del lines
        return ''.join(pieces)

    def _parse_text_block(self, block):
        """
        Append one 'Text' entry per non-empty paragraph of a Text block.

        Some blocks can have multiple styles in them. We treat those as
        multiple blocks.
        """
        paras = block.iterdescendants(tag="{{{}}}par".format(self.ns))
        for para in paras:
            # Get the paragraph style and text
            para_id = para.get("style")
            if para_id not in self.paragraphs:
                self.logger.debug(
                    'Block {} has no paragraphStyle'.format(para_id)
                )
                self.paragraphs[para_id] = dict()
            style = self.paragraphs[para_id]

            text = self._join_lines(para)

            # Ignore whitespace-only pars
            if not text:
                continue

            # Get the paragraph role
            # FR6 docs have no structure, styles, roles
            if self.version == "FR10":
                # A paragraph whose style is unknown (or carries no role)
                # is treated as body text instead of raising KeyError.
                role = style.get('role', 'text')
            else:
                role = "FR6"

            # Skip headers and footers
            if role == 'rt':
                continue

            # This is a good text chunk. Instantiate the block.
            self.blocks.append({
                'type': 'Text',
                'page_no': self.page_no,
                'text': text,
                'role': role,
                'style': style,
            })

            # To help with unmarked header recognition
            if self.newpage:
                self.blocks[-1]['first'] = True
                self.newpage = False

            # Mark up heading level
            if role == 'heading':
                # shortcut so we need fewer lookups later
                self.blocks[-1]['heading'] = style['roleLevel']

            para.clear()    # garbage collection
        del paras           # garbage collection

    def _parse_table_block(self, block, blockattr):
        """
        Append the entries describing a Table block.

        We process the table by treating each of its cells' subordinate
        blocks as separate. The last element in each cell/row/table is
        flagged with 'last_table_elem', so the elements can be closed after
        each is complete.
        """
        row_tag = "{{{}}}row".format(self.ns)
        cell_tag = "{{{}}}cell".format(self.ns)
        par_tag = "{{{}}}par".format(self.ns)

        self.blocks.append({
            'type': 'Table',
            'style': blockattr,
            'page_no': self.page_no,
        })

        # Make the iterator into a list so we can calculate length
        # with only one iteration. Should be a small chunk so unlikely
        # to be a memory hog.
        rows = list(block.iterdescendants(tag=row_tag))
        rows_in_table = len(rows)
        for this_row, row in enumerate(rows, start=1):
            self.blocks.append({
                'type': 'TableRow',
                'style': blockattr,
                'page_no': self.page_no,
            })
            if this_row == rows_in_table:
                self.blocks[-1]['last_table_elem'] = True

            cells = list(row.iterdescendants(tag=cell_tag))
            cells_in_row = len(cells)
            for this_cell, cell in enumerate(cells, start=1):
                self.blocks.append({
                    'type': 'TableCell',
                    'style': blockattr,
                    'page_no': self.page_no,
                })
                if this_cell == cells_in_row:
                    self.blocks[-1]['last_table_elem'] = True

                # Parsing a cell is not quite like parsing text.
                # The element layout is cell -> text -> par.
                text_elem = cell.find("a:text", namespaces=self.nsm)
                paras = list(text_elem.iterdescendants(tag=par_tag))
                paras_in_cell = len(paras)
                emitted = False
                for para in paras:
                    # Ignore whitespace-only para unless it's
                    # an empty cell. If so, placeholder
                    if not gettext(para).strip() and paras_in_cell > 1:
                        continue

                    self.blocks.append({
                        'type': 'TableText',
                        'style': blockattr,
                        'page_no': self.page_no,
                        'text': self._join_lines(para),
                    })
                    emitted = True
                    if self.newpage:
                        self.newpage = False
                    para.clear()

                # Flag the last text actually emitted for this cell.
                # Counting paragraphs instead goes wrong as soon as one
                # of them is skipped above.
                if emitted:
                    self.blocks[-1]['last_table_elem'] = True

                del paras
                del text_elem
                cell.clear()    # garbage collection
            del cells           # garbage collection
            row.clear()         # garbage collection
        del rows                # garbage collection

    def _parse_other_block(self, block, blockattr):
        """
        Append an entry for a non-text, non-table block.

        Pictures are additionally recorded in
        `self.metadata['pics_by_page']`, a dict mapping page number to the
        list of picture entries on that page, so overlapping images can be
        stripped out later.
        """
        # Create entry for non-text blocks with type & attributes
        entry = {
            'type': block.get("blockType"),
            'style': blockattr,
            'page_no': self.page_no,
        }
        self.blocks.append(entry)

        if self.is_block_type(blockattr, "Picture"):
            # Index by page number; do not replace the dict itself.
            self.metadata['pics_by_page'].setdefault(
                self.page_no, []
            ).append(entry)