# Copyright 2017 Deborah Kaplan
#
# This file is part of Abbyy-to-epub3.
# Source code is available at <https://github.com/deborahgu/abbyy-to-epub3>.
#
# Abbyy-to-epub3 is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from ebooklib import utils as ebooklibutils
from lxml import etree
import gc
import logging
import pycountry
import re
from abbyy_to_epub3 import constants
from abbyy_to_epub3.utils import fast_iter, gettext, sanitize_xml
def add_last_text(blocks, page):
    """
    Given a list of blocks and the page number of the last page in the list,
    mark up the last text block for that page in the list, if it exists.

    The list is scanned from its end. The scan stops without marking anything
    at the first block that has no `page_no` key or that belongs to an earlier
    page; blocks that belong to a later page are skipped over.

    :param blocks: list of block dicts. The list itself is never resized; the
        matching block dict gains a `last` key set to True.
    :param page: page number whose final text block should be flagged.
    :return: None
    """
    # Walk backwards in place instead of copying the list on every step
    for elem in reversed(blocks):
        if 'page_no' not in elem:
            return
        # Stop once we reach the previous page without hitting text
        if elem['page_no'] < page:
            return
        # Text blocks are stored with type 'Text'; the lowercase spelling is
        # still accepted so existing callers keep working
        if elem['page_no'] == page and elem.get('type') in ('Text', 'text'):
            elem['last'] = True
            return
[docs]class AbbyyParser(object):
"""
The ABBYY parser object.
Parses ABBYY metadata in preparation for import into an EPUB 3 document.
An ABBYY document begins with font and style information:
.. code:: html
<documentData>
<paragraphStyles>
<paragraphStyle
id="{idnum}" name="stylename"
mainFontStyleId="{idnum}" [style info]>
<fontStyle id="{idnum}" [style info]>
</paragraphStyle>
[more styles]
</documentData>
This is followed by the data for the pages.
.. code:: html
<page>
<block></block>
[more blocks]
</page>
Blocks have types. We process types Text, Picture, and Table.
Text:
.. code:: html
<page>
<region>
<text> contains a '\\n' as a text element
<par> The paragraph, repeatable
<line> The line, repeatable
<formatting>
<charParams>: The individual character
Picture: we know the corresponding scan (page) number, & coordinates.
Table:
.. code:: html
<row>
<cell>
<text>
<par>
Each `<par>` has an identifier, which has a unique style, including
the paragraph's role, eg:
.. code:: html
<par align="Right" lineSpacing="1790"
style="{000000DD-016F-0A36-032F-EEBBD9B8571E}">
This corresponds to a paragraphStyle from the `<documentData>` element:
.. code:: html
<paragraphStyle
id="{000000DD-016F-0A36-032F-EEBBD9B8571E}"
name="Heading #1|1"
mainFontStyleId="{000000DE-016F-0A37-032F-176E5F6405F5}"
role="heading" roleLevel="1"
[style information]>
The roles map as follows:
================= ==============
Role name role
================= ==============
Body text text
Footnote footnote
Header or footer rt
Heading heading
Other other
Table caption tableCaption
Table of contents contents
================= ==============
"""
# Set these once we start parsing the tree and know our schema
ns = ''
nsm = ''
version = ''
etree = ''
def __init__(
    self, document, metadata_file, metadata,
    paragraphs, blocks, debug=False
):
    """
    Store the parser inputs and configure logging.

    `metadata`, `paragraphs` and `blocks` are kept by reference and filled in
    by the parsing methods. When `debug` is true, a stream handler is attached
    to this module's logger and its level is set to DEBUG.
    """
    log = logging.getLogger(__name__)
    if debug:
        log.addHandler(logging.StreamHandler())
        log.setLevel(logging.DEBUG)
    self.logger = log

    self.document = document
    self.metadata_file = metadata_file
    self.metadata = metadata
    self.paragraphs = paragraphs
    self.blocks = blocks
    self.page_no = 0

    # Save page numbers only if using a supporting version of ebooklib
    self.metadata['PAGES_SUPPORT'] = (
        'create_pagebreak' in dir(ebooklibutils)
    )
def is_block_type(self, blockattr, blocktype):
    """ Report whether the block attributes carry the given `blockType`. """
    if 'blockType' not in blockattr:
        return False
    return blockattr['blockType'] == blocktype
def find_namespace(self):
    """
    Find the namespace of an XML document. Assumes that the namespace of
    the first element in the context is the namespace we need. This is more
    memory-efficient than parsing the entire tree to get the root node.

    Sets `self.ns`, `self.nsm` and `self.version`, and records the version
    under `metadata['fr-version']`. Raises RuntimeError when the first
    element declares neither of the supported namespaces.
    """
    # Namespace depends on finereader version.
    # We can parse FR6 schema, a little
    known_schemas = (
        (constants.ABBYY_NS, constants.ABBYY_NSM, "FR10"),
        (constants.OLD_NS, constants.OLD_NSM, "FR6"),
    )
    events = etree.iterparse(self.document, events=('start',),)
    for _event, node in events:
        # A version is already recorded, so there is nothing left to learn
        if self.version:
            return
        declared = node.nsmap.values()
        for namespace, nsmap, version in known_schemas:
            if namespace in declared:
                self.nsm = nsmap
                self.ns = namespace
                self.version = version
                break
        else:
            raise RuntimeError("Input XML not in a supported schema.")
        self.logger.debug("FineReader Version {}".format(self.version))
        self.metadata['fr-version'] = self.version
def parse_abbyy(self):
    """
    Parse the ABBYY into a format useful for `create_epub`. Process the
    elements we will need to construct the EPUB: `paragraphStyle`,
    `fontStyle`, and `page`. We traverse the entire tree twice with
    `iterparse`, because lxml builds the whole node tree in memory for even
    tag-selective `iterparse`, & if we don't traverse the whole tree, we
    can't delete the unowned nodes. `fast_iter` makes the process speedy,
    and the dual processing saves on memory. Because of the layout
    of the elements in the ABBYY file, it's too complex to do this in a
    single iterative pass.

    Side effects: resets `metadata['pics_by_page']`, `self.fontStyles` and
    `self.pages`, fills `self.paragraphs` and `self.blocks`, and changes the
    process-wide garbage collection thresholds.
    """
    # some basic initialization
    self.metadata['pics_by_page'] = dict()
    self.fontStyles = dict()
    self.pages = []

    # Be aggressive with garbage collection; parsing the XML hogs memory
    gc.set_threshold(1, 1, 1)

    # Get the namespace & the FR version, so we can find the other elements
    self.find_namespace()
    self.logger.debug("Beginning iterparse")

    # paragraphStyle is a prerequisite for page
    context = etree.iterparse(
        self.document,
        events=('end',),
    )
    self.logger.debug("fast_iter on process_styles")
    fast_iter(context, self.process_styles)
    del context

    # Because of the processing order of XML events, it's efficient
    # to collect para and font styles upfront & collate it after.
    # fontStyles is keyed by font style id, so look up the id the paragraph
    # style points at rather than the attribute name itself.
    for attribs in self.paragraphs.values():
        if (
            'mainFontStyleId' in attribs and
            attribs['mainFontStyleId'] in self.fontStyles
        ):
            attribs['fontstyle'] = self.fontStyles[
                attribs['mainFontStyleId']
            ]

    # parse the metadata document next
    self.logger.debug("parse_metadata")
    self.parse_metadata()

    # finally, extract the individual page elements from the XML
    self.logger.debug("Beginning iterparse on pages")
    context = etree.iterparse(
        self.document,
        events=('end',),
        tag="{{{}}}page".format(self.ns),
    )
    self.logger.debug("fast_iter on process_pages")
    fast_iter(context, self.process_pages)
    del context

    # if we don't clear the list, the page elements will stick around
    # even after the list's scope has vanished, leaking memory
    self.pages.clear()
[docs] def process_styles(self, elem):
"""
Iteratively parse styles from the ABBYY file into data structures.
The ABBYY seems to be sometimes inconsistent about whether these
elements have a namespace, so be forgiving.
"""
if (
elem.tag == "{{{}}}paragraphStyle".format(self.ns) or
elem.tag == "paragraphStyle"
):
"""
Paragraph styles are on their own at the start of the ABBYY
and contain child fontStyle elements
"""
self.paragraphs[elem.get("id")] = dict(elem.attrib)
fontstyles = elem.iterchildren()
for fontstyle in fontstyles:
self.fontStyles[fontstyle.get("id")] = dict(fontstyle.attrib)
def process_pages(self, elem):
    """
    Iteratively process pages from the ABBYY file. We have to process now
    rather than copying the pages for later processing, because deepcopying
    an lxml element replicates the entire tree.

    The ABBYY seems to be sometimes inconsistent about whether these
    elements have a namespace, so be forgiving.

    For a page element this records the page dimensions, parses every child
    block, flags the last text block of the page, appends a 'Page' marker
    block when page numbers are supported, and advances `self.page_no`.
    Elements with any other tag are ignored.
    """
    if elem.tag not in ("{{{}}}page".format(self.ns), "page"):
        return

    self.pagewidth = elem.get('width')
    self.pageheight = elem.get('height')

    # An iterator is always truthy, so materialise the children to make the
    # empty-page test meaningful. A single page's child list is small.
    page_blocks = list(elem.iterchildren())
    if not page_blocks:
        self.page_no += 1
        return

    self.newpage = True
    # Most pages have multiple `<block>` elements
    for block in page_blocks:
        self.parse_block(block)
    del page_blocks

    # Mark up the last text block on the page, if there is one
    add_last_text(self.blocks, self.page_no)

    # For accessibility, create a page number at the end of every page
    # with content. A single collected block is enough to need a marker.
    if (
        self.metadata['PAGES_SUPPORT'] and
        self.blocks and
        self.blocks[-1]['type'] != 'Page'
    ):
        self.blocks.append({
            'type': 'Page',
            'text': self.page_no,
        })

    # Set up the next iteration.
    self.page_no += 1
[docs] def parse_block(self, block):
""" Parse a single block on the page. """
blockattr = block.attrib
blockattr['pagewidth'] = self.pagewidth
blockattr['pageheight'] = self.pageheight
if self.is_block_type(blockattr, "Text"):
paras = block.iterdescendants(
tag="{{{}}}par".format(self.ns)
)
# Some blocks can have multiple styles in them. We'll treat
# those as multiple blocks.
for para in paras:
# Get the paragraph style and text
para_id = para.get("style")
if para_id not in self.paragraphs:
self.logger.debug(
'Block {} has no paragraphStyle'.format(
para_id
)
)
self.paragraphs[para_id] = dict()
# Preserve line breaks so we can strip EOL hyphens and pad
# whitespace at line endings
# newlines
lines = para.iterdescendants(
tag="{{{}}}line".format(self.ns)
)
text = ''
for line in lines:
linetext = gettext(line).strip()
# replace EOL hyphens with whitespace stripping
# add whitespace at end of all other lines
text_clean = sanitize_xml(linetext)
text_pad = re.sub(r'([^¬-])\Z', r'\1 ', text_clean)
text_complete = re.sub(r'[¬-]\s*\Z', r'', text_pad)
text += text_complete
line.clear()
del(lines)
# Ignore whitespace-only pars
if not text:
continue
# Get the paragraph role
# FR6 docs have no structure, styles, roles
if self.version == "FR10":
role = self.paragraphs[para_id]['role']
else:
role = "FR6"
# Skip headers and footers
if role == 'rt':
continue
# This is a good text chunk. Instantiate the block.
self.blocks.append({
'type': 'Text',
'page_no': self.page_no,
'text': text,
'role': role,
'style': self.paragraphs[para_id],
})
# To help with unmarked header recognition
if self.newpage:
self.blocks[-1]['first'] = True
self.newpage = False
# Mark up heading level
if role == 'heading':
level = self.paragraphs[para_id]['roleLevel']
# shortcut so we need fewer lookups later
self.blocks[-1]['heading'] = level
para.clear() # garbage collection
del paras # garbage collection
elif self.is_block_type(blockattr, "Table"):
# We'll process the table by treating each of its cells'
# subordinate blocks as separate. Keep track of which is the last
# element in a cell/row/table, so we can close the elements after
# each is complete.
this_row = 1
self.blocks.append({
'type': 'Table',
'style': blockattr,
'page_no': self.page_no,
})
# Make the iterator into a list so we can calculate length
# with only one iteration. Should be a small chunk so unlikely
# to be a memory hog.
rows = list(
block.iterdescendants(tag="{{{}}}row".format(self.ns))
)
rows_in_table = len(rows)
for row in rows:
this_cell = 1
self.blocks.append({
'type': 'TableRow',
'style': blockattr,
'page_no': self.page_no,
})
if this_row == rows_in_table:
self.blocks[-1]['last_table_elem'] = True
this_row += 1
cells = list(row.iterdescendants(
tag="{{{}}}cell".format(self.ns)
))
cells_in_row = len(cells)
for cell in cells:
this_contents = 1
self.blocks.append({
'type': 'TableCell',
'style': blockattr,
'page_no': self.page_no,
})
if this_cell == cells_in_row:
self.blocks[-1]['last_table_elem'] = True
this_cell += 1
# Parsing a cell is not quite like parsing text.
# The element layout is cell -> text -> par.
text = cell.find("a:text", namespaces=self.nsm)
paras = list(text.iterdescendants(
tag="{{{}}}par".format(self.ns)
))
paras_in_cell = len(paras)
for para in paras:
para_id = para.get("style")
text = gettext(para).strip()
# Ignore whitespace-only para unless it's
# an empty cell. If so, placeholder
if not text and paras_in_cell > 1:
continue
# Preserve line breaks to strip EOL hyphens & pad
# whitespace at line endings
lines = para.iterdescendants(
tag="{{{}}}line".format(self.ns)
)
text = ''
for line in lines:
linetext = gettext(line).strip()
# replace EOL hyphens with whitespace stripping
# add whitespace at end of all other lines
text_clean = sanitize_xml(linetext)
text_pad = re.sub(r'([^¬-])\Z', r'\1 ', text_clean)
text_complete = re.sub(r'[¬-]\s*\Z', r'', text_pad)
text += text_complete
line.clear()
del(lines)
self.blocks.append({
'type': 'TableText',
'style': blockattr,
'page_no': self.page_no,
'text': text,
})
if this_contents == paras_in_cell:
self.blocks[-1]['last_table_elem'] = True
this_contents += 1
if self.newpage:
self.newpage = False
para.clear()
del paras
del text
cell.clear() # garbage collection
del cells # garbage collection
row.clear() # garbage collection
del rows # garbage collection
else:
# Create entry for non-text blocks with type & attributes
d = {
'type': block.get("blockType"),
'style': blockattr,
'page_no': self.page_no,
}
self.blocks.append(d)
# If this is an image, add it to a dict of all images
# by page number, so we can strip out overlapping images
if self.is_block_type(blockattr, "Picture"):
if self.page_no in self.metadata['pics_by_page']:
self.metadata['pics_by_page'].append(d)
else:
self.metadata['pics_by_page'] = [d, ]
d = dict()