Source code for abbyy_to_epub3.create_epub

# Copyright 2017 Deborah Kaplan
#
# This file is part of Abbyy-to-epub3.
# Source code is available at <https://github.com/deborahgu/abbyy-to-epub3>.
#
# Abbyy-to-epub3 is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from collections import OrderedDict
from ebooklib import epub
from ebooklib import utils as ebooklib_utils
from fuzzywuzzy import fuzz
from numeral import roman2int
from pkg_resources import resource_filename

from zipfile import BadZipFile, ZipFile

import configparser
import gzip
import logging
import os
import sys
import re
import subprocess
import tempfile

from abbyy_to_epub3 import __version__
from abbyy_to_epub3.constants import skippable_pages
from abbyy_to_epub3.parse_abbyy import AbbyyParser
from abbyy_to_epub3.image_processing import factory as ImageFactory
from abbyy_to_epub3.parse_scandata import ScandataParser
from abbyy_to_epub3.utils import dirtify_xml, is_increasing
from abbyy_to_epub3.verify_epub import EpubVerify


# Set up configuration
# The package's bundled config.ini is located and parsed once, at import
# time; the methods below read their settings from its 'Main' section.
config = configparser.ConfigParser()
configfile = resource_filename("abbyy_to_epub3", "config.ini")
config.read(configfile)

# Status passed to sys.exit() when a book's scandata file is missing
ERR_MISSING_SCANDATA = 3

class ArchiveBookItem(object):
    """Archive.org is a website which contains an archive of items
    composed of archived digital content. Archive.org items are
    distributed across a cluster of machines called datanodes. In
    order to access the files of an item, you need to know 4 things:

    a) The Archive.org `item_identifier` (the unique ID of this item)
       e.g. https://archive.org/details/{item_identifier}
    b) the datanode server address which hosts this item
    c) the `item_dir` which is the file path on this datanode where
       this item's files are kept
    d) the name of the files within this `item_dir`

    Certain archive.org items are specifically structured (file
    organizations, contents, names) to store and play Books. Every
    Archive Book Item contains the following files:
    - a jp2.zip containing all the scanned images of the book
    - an abbyy file containing the OCR'd plaintext of these scans
    - scandata.xml whose metadata describes the structure of the book
      (metadata, page numbers)
    - meta.xml which describes the entire archive.org *item*

    A complication is that Archive.org Book Items may contain 1 or
    more books. In order to accommodate this subtlety and delineate
    between books, an `item_dir` and `item_identifier` are not
    sufficient to isolate a specific book. To circumvent this
    limitation, we require another identifier called the
    `item_bookpath` which acts as a prefix to the files of a specific
    book. Given a datanode and an `item_dir` of an Archive Book Item,
    all the constituent files for a book can be constructed using
    `item_identifier` and `item_bookpath` in the following ways:

    - There is a single global metadata manifest file for the entire
      Archive Item named `{item_identifier}_meta.xml`.
    - All of the other book specific files follow the form
      `{item_bookpath}_{file}`. e.g. `{item_bookpath}_abbyy.gz`
    """
    def __init__(self, item_dir, item_identifier, item_bookpath):
        """
        Locate the four input files of a book and record their paths.

        On success the absolute paths are stored as `self.meta_xml`,
        `self.abbyy_gz`, `self.scandata_xml` and `self.jp2_zip`.

        :param item_dir: directory holding the item's files
        :param item_identifier: prefix of the item-wide `_meta.xml` file
        :param item_bookpath: prefix of the book-specific files
        :raises OSError: if the meta, abbyy or jp2 file does not exist
        :raises SystemExit: with status ERR_MISSING_SCANDATA if the
            scandata file does not exist
        """
        self.item_dir = item_dir
        self.item_identifier = item_identifier
        self.item_bookpath = item_bookpath

        # A subclass may have configured self.logger before delegating to
        # this constructor; when the class is used on its own, fall back to
        # the module logger so a missing file is reported as OSError rather
        # than failing with AttributeError on the logging call.
        if getattr(self, 'logger', None) is None:
            self.logger = logging.getLogger(__name__)

        # Guarantee all input files exist
        # These members will be set as self.`name`_`ext`, e.g. self.meta_xml
        input_files = [
            # prefix, name, ext
            (item_identifier, 'meta', 'xml'),
            (item_bookpath, 'abbyy', 'gz'),
            (item_bookpath, 'scandata', 'xml'),
            (item_bookpath, 'jp2', 'zip')]
        for (prefix, name, ext) in input_files:
            dependency = os.path.abspath(
                os.path.join(item_dir, '%s_%s.%s' % (prefix, name, ext)))
            if not os.path.exists(dependency):
                message = "Invalid path to %s.%s: %s" % (
                    name, ext, dependency)
                self.logger.debug(message)
                if name == "scandata":
                    # A missing scandata file ends the process with a
                    # dedicated exit status instead of an exception
                    sys.exit(ERR_MISSING_SCANDATA)
                raise OSError(message)
            setattr(self, '%s_%s' % (name, ext), dependency)


[docs]class Ebook(ArchiveBookItem):
    """
    Ebook is a utility for generating epub3 files based on Archive.org items.
    Holds extracted information about a book & the ebooklib EPUB object.
    """
    DEFAULT_EPUBCHECK_LEVEL = 'warning'
    DEFAULT_ACE_LEVEL = 'minor'

    def __init__(
            self, item_dir, item_identifier, item_bookpath,
            debug=False, epubcheck=None, ace=None,
    ):
        """
        Initialise the conversion state for one book.

        Sets up logging, the empty per-book containers, the ebooklib
        EpubBook and the verifier, picks an image processor, and finally
        delegates to ArchiveBookItem to locate the input files.

        :param item_dir: directory holding the item's files
        :param item_identifier: prefix of the item-wide `_meta.xml` file
        :param item_bookpath: prefix of the book-specific files
        :param debug: when true, log at DEBUG level to a stream handler and
            default `epubcheck`/`ace` to the class-level default levels
        :param epubcheck: epubcheck level; None falls back as described
        :param ace: ace level; None falls back as described
        """

        self.logger = logging.getLogger(__name__)
        if debug:
            # The logger is shared by every instance in the process, so only
            # attach a plain stream handler once; otherwise each additional
            # debug instance would duplicate every log line.
            already_streaming = any(
                type(handler) is logging.StreamHandler
                for handler in self.logger.handlers
            )
            if not already_streaming:
                self.logger.addHandler(logging.StreamHandler())
            self.logger.setLevel(logging.DEBUG)

        # Initialize all the book's variables cleanly
        self.debug = debug
        self.epubcheck = epubcheck or (
            # If no epubcheck specified and we're in debug mode, run
            # --epubcheck warning
            self.DEFAULT_EPUBCHECK_LEVEL if self.debug else None)
        self.ace = ace or (
            # If no ace specified and we're in debug mode, run
            # --ace minor
            self.DEFAULT_ACE_LEVEL if self.debug else None)
        self.metadata = {}     # the book's metadata
        self.blocks = []       # all <blocks> with contents, attributes
        self.paragraphs = {}   # paragraph style info

        self.tmpdir = ''       # stores converted images & extracted zip files
        self.abbyy_file = ''   # the ABBYY XML file
        self.chapters = []     # holds each of the chapter (EpubHtml) objects
        self.progression = ''  # page direction
        self.firsts = {}       # all first lines per-page
        self.lasts = {}        # all last lines per-page
        self.pages = OrderedDict()    # page-by-page information from scandata
        self.chapter_no = 0    # current number of identified chapters

        # are there headers, footers, or page numbers?
        self.headers_present = False
        # Initialised alongside headers_present so the attribute always
        # exists, even when no repeated footer is ever detected
        self.footers_present = False
        self.pagenums_found = False
        self.rpagenums_found = False
        self.table = False
        self.table_row = False
        self.table_cell = False

        self.book = epub.EpubBook()  # the book itself
        # ebooklib.epub doesn't clean up cleanly without reset,
        # causing problems on consecutive runs
        self.book.reset()
        self.verifier = EpubVerify(self.debug)

        # Choose the image processing library: use kakadu when its
        # kdu_compress binary can be run, otherwise fall back to pillow
        try:
            subprocess.run(
                ["kdu_compress", "-v"], stdout=subprocess.DEVNULL, check=True
            )
            self.image_processor = "kakadu"
        except (FileNotFoundError, subprocess.CalledProcessError):
            self.image_processor = "pillow"
        self.logger.debug("Image processing with {}.".format(self.image_processor))

        # Runs last: the parent constructor logs through self.logger
        super(Ebook, self).__init__(item_dir, item_identifier, item_bookpath)

[docs]    def load_scandata_pages(self):
        """
        Parse the page-by-page scandata file. This stores page size,
        right or left leaf, and page type (eg copyright, color card, etc).
        """
        parser = ScandataParser(
            self.scandata_xml,
            self.pages,
            debug=self.debug,
        )
        parser.parse_scandata()

[docs]    def create_accessibility_metadata(self):
        """ Set up accessibility metadata """
        ALT_TEXT_PRESENT = config.getboolean('Main', 'ALT_TEXT_PRESENT')
        IMAGES_PRESENT = config.getboolean('Main', 'IMAGES_PRESENT')
        OCR_GENERATED = config.getboolean('Main', 'OCR_GENERATED')
        TEXT_PRESENT = config.getboolean('Main', 'TEXT_PRESENT')

        summary = ''
        modes = []
        modes_sufficient = []
        features = ['printPageNumbers', 'tableOfContents', ]

        if OCR_GENERATED:
            summary += (
                'The publication was generated using automated character '
                'recognition, therefore it may not be an accurate rendition '
                'of the original text, and it may not offer the correct '
                'reading sequence.'
            )
        if IMAGES_PRESENT:
            modes.append('visual')
            if ALT_TEXT_PRESENT:
                features.append('alternativeText')
            else:
                summary += (
                    'This publication is missing meaningful alternative text.'
                )
        if TEXT_PRESENT:
            modes.append('textual')
            if IMAGES_PRESENT:
                modes_sufficient.append('textual,visual')
                if ALT_TEXT_PRESENT:
                    modes_sufficient.append('textual')
            else:
                modes_sufficient.append('textual')
        elif IMAGES_PRESENT and ALT_TEXT_PRESENT:
            modes_sufficient.append('textual,visual')
            modes_sufficient.append('visual')
        elif IMAGES_PRESENT:
            modes_sufficient.append('visual')
        if OCR_GENERATED:
            # these states will be true for any static content,  which we know
            # is guaranteed for OCR generated texts.
            hazards = [
                'noFlashingHazard',
                'noMotionSimulationHazard',
                'noSoundHazard',
            ]
            controls = [
                'fullKeyboardControl',
                'fullMouseControl',
                'fullSwitchControl',
                'fullTouchControl',
                'fullVoiceControl',
            ]

        if summary:
            summary += 'The publication otherwise meets WCAG 2.0 Level A.'
        else:
            summary = 'The publication meets WCAG 2.0 Level A.'

        # Add the metadata to the publication
        self.book.add_metadata(
            None,
            'meta',
            summary,
            OrderedDict([('property', 'schema:accessibilitySummary')])
        )
        for feature in features:
            self.book.add_metadata(
                None,
                'meta',
                feature,
                OrderedDict([('property', 'schema:accessibilityFeature')])
            )
        for mode in modes:
            self.book.add_metadata(
                None,
                'meta',
                mode,
                OrderedDict([('property', 'schema:accessMode')])
            )
        for mode_sufficient in modes_sufficient:
            self.book.add_metadata(
                None,
                'meta',
                mode_sufficient,
                OrderedDict([('property', 'schema:accessModeSufficient')])
            )
        if hazards:
            for hazard in hazards:
                self.book.add_metadata(
                    None,
                    'meta',
                    hazard,
                    OrderedDict([('property', 'schema:accessibilityHazard')])
                )
        if controls:
            for control in controls:
                self.book.add_metadata(
                    None,
                    'meta',
                    control,
                    OrderedDict([('property', 'schema:accessibilityControl')])
                )

[docs]    def extract_images(self):
        """
        Extracts all of the images for the text.

        For efficiency's sake, do these all at once. Memory & CPU will be at a
        higher premium than disk space, so unzip the entire scan file into temp
        directory, instead of extracting only the needed images.
        """
        # extract jp2 images into tmpdir
        try:
            with ZipFile(self.jp2_zip) as f:
                f.extractall(self.tmpdir)
        except BadZipFile as e:
            self.logger.error(
                "extraction problem with {}".format(self.jp2_zip)
            )
            raise BadZipFile

[docs]    def images_are_extracted(self):
        if '.zip' not in self.jp2_zip:
            raise ValueError('jp2 dir misconfiguration: not a .zip')
        extracted_dir, _ = os.path.splitext(os.path.join(self.tmpdir, os.path.basename(self.jp2_zip)))
        self.logger.debug(extracted_dir)
        return os.path.exists(extracted_dir)

[docs]    def get_cover_leaf(self):
        """
        Try to find a cover image. If nothing is tagged as 'Cover', use
        the first page tagged 'Title'. If nothing is tagged as
        'Title', either, use the first page tagged
        'Normal'. Self.pages is an OrderedDict so break as soon as you
        find something useful, and don't search the whole set of
        pages.
        """
        pages_iter = iter(self.pages)
        for p in pages_iter:
            if self.pages[p] == 'cover':
                cover_leaf = p
                break
            elif self.pages[p] == 'title':
                cover_leaf = p
                break
            elif self.pages[p] == 'normal':
                cover_leaf = p
                break
        try:
            return cover_leaf
        except NameError:
            e = "No pages in scandata marked as Cover, Title, or Normal"
            self.logger.error(e)
            raise RuntimeError(e)

[docs]    def extract_cover(self):
        """
        http://web.archive.org/web/20180416230000/https://www.safaribooksonline.com/blog/2009/11/20/best-practices-in-epub-cover-images/
        """
        if not self.images_are_extracted():
            raise RuntimeError(
                'extract_covers cannot be run before extract_images'
            )

        # pad out the filename to four digits
        cover_jp2 = "{tmp}/{item_bookpath}_jp2/{item_bookpath}_{num:0>4}.jp2".format(
            tmp=self.tmpdir, item_bookpath=self.item_bookpath, num=self.get_cover_leaf())
        cover_png = '{}/cover.png'.format(self.tmpdir)
        self.logger.debug("jp2: %s & png: %s" % (
            cover_jp2, cover_png))

        # convert the JP2K file into a usable format for the cover
        imageobj = ImageFactory(self.image_processor)
        try:
            imageobj.crop_image(
                cover_jp2, cover_png, resize=(800, 1200)
            )
        except RuntimeError as e:
            # for failed image creation, keep processing the epub
            self.logger.error(e)

        self.book.set_cover(
            'images/cover.png', open(cover_png, 'rb').read())
        cover = self.book.items[-1]
        self.logger.debug(cover)
        cover.add_link(
            href='style/style.css', rel='stylesheet', type='text/css')

[docs]    def image_dim(self, block):
        """
        Given a dict object containing the block info for an image, generate
        a tuple of its dimensions:
        (left, top, right, bottom)
        """
        left = int(block['style']['l'])
        top = int(block['style']['t'])
        right = int(block['style']['r'])
        bottom = int(block['style']['b'])

        return (left, top, right, bottom)

[docs]    def make_image(self, block):
        """
        Given a dict object containing the block info for an image, generate
        the image HTML
        """
        page_no = block['page_no']
        if page_no == 0:
            # The first page's image is made into the cover automatically
            return

        # pad out the filename to four digits
        origfile = '{dir}/{item_bookpath}_jp2/{item_bookpath}_{page:0>4}.jp2'.format(
            dir=self.tmpdir,
            item_bookpath=self.item_bookpath,
            page=page_no
        )
        if not os.path.isfile(origfile):
            return
        basefile = 'img_{:0>4}.png'.format(self.picnum)
        outfile = '{}/{}'.format(self.tmpdir, basefile)
        in_epub_imagefile = 'images/{}'.format(basefile)

        # get image dimensions from ABBYY block attributes
        # (left, top, right, bottom)
        box = self.image_dim(block)
        width = box[2] - box[0]
        height = box[3] - box[1]

        # some image processors also need the original page dimensions
        pagewidth = float(block['style']['pagewidth'])
        pageheight = float(block['style']['pageheight'])
        pagedim = (pagewidth, pageheight)

        # ignore if this image is entirely encapsulated in another image
        for each_pic in self.metadata['pics_by_page']:
            # Ignore if this is just the block itself
            if each_pic == block:
                continue
            new_box = self.image_dim(each_pic)
            if all(i >= j for i, j in zip(box, new_box)):
                return

        # make the image:
        imageobj = ImageFactory(self.image_processor)
        try:
            imageobj.crop_image(origfile, outfile, dim=box, pagedim=pagedim)
        except RuntimeError as e:
            # for failed image creation, keep processing the epub
            self.logger.error(e)
            return ''
        epubimage = epub.EpubImage()
        epubimage.file_name = in_epub_imagefile
        with open(outfile, 'rb') as f:
            epubimage.content = f.read()
        epubimage = self.book.add_item(epubimage)

        # to approximate original layout, set the image container width to
        # percentage of the page width
        container_w = (width / pagewidth) * 100
        content = u'''
        <div style="width: {c_w}%;">
        <img src="{src}" alt="Picture #{picnum}">
        </div>
        '''.format(
            c_w=container_w,
            src=in_epub_imagefile,
            picnum=self.picnum,
            w=width,
            h=height,)

        # increment the image number
        self.picnum += 1

        return content

[docs]    def make_chapter(self, heading):
        """
        Create a chapter section in an ebooklib.epub.
        """
        # If we haven't passed a heading, just use the inferred chapter number
        # which won't correspond to the original's chapters.
        if not heading:
            heading = "Chapter {}".format(self.chapter_no)

        # If the previous chapter's content is empty, merge the two
        # Use the first chapter's chapter name and number, which are
        # likely to have been set by scandata and not OCR'd text.
        if self.chapters and self.chapters[-1].content == u'':
            chapter = self.chapters[-1]
            chapter.content = u'<h2>{}</h2>'.format(heading)
        else:
            # Increment the chapter number before creating a new one
            self.chapter_no += 1

            # The epub library escapes the XML itself
            chapter = epub.EpubHtml(
                title=dirtify_xml(heading).replace("\n", " "),
                direction=self.progression,
                # pad out the filename to four digits
                file_name='chap_{:0>4}.xhtml'.format(self.chapter_no),
                lang='{}'.format(self.metadata['language'][0])
            )
            chapter.content = u''
            chapter.add_link(
                href='style/style.css', rel='stylesheet', type='text/css'
            )
            self.chapters.append(chapter)
            self.book.add_item(chapter)

        return chapter

[docs]    def identify_headers_footers_pagenos(self, placement):
        """
        Attempts to identify the presence of headers, footers, or page numbers

        1. Build a dict of first & last lines, indexed by page number.
        2. Try to identify headers and footers.

        Headers and footers can appear on every page, or on alternating pages
        (for example if one page has header of the title, the facing page
        might have the header of the chapter name).

        They may include a page number, or the page number might be a
        standalone header or footer.

        The presence of headers and footers in the document does not mean they
        appear on every page (for example, chapter openings or illustrated
        pages sometimes don't contain the header/footer, or contain a modified
        version, such as a standalone page number).

        Page numbers may be in Arabic or Roman numerals.

        This method does not attempt to look for all edge cases. For example,
        it will not find:
        - constantly varied headers, as in a dictionary
        - page numbers that don't steadily increase
        - page numbers misidentified in the OCR process, eg. IO2 for 102
        - page numbers with characters around them, eg. '~ 45 ~'
        """

        # running this on first lines or last lines?
        if placement == 'first':
            mylines = self.firsts
        else:
            mylines = self.lasts
        self.logger.debug("Looking for headers/footers: {}".format(placement))

        # Look for standalone strings of digits
        digits = re.compile(r'^\d+$')
        romans = re.compile(r'^[xicmlvd]+$')
        candidate_digits = []
        candidate_romans = []
        for block in self.blocks:
            if placement in block:
                line = block['text']
                ourpageno = block['page_no']
                mylines[ourpageno] = {'text': block['text']}
                pageno = digits.search(line)
                rpageno = romans.search(line, re.IGNORECASE)
                if rpageno:
                    # Is this a roman numeral?
                    try:
                        # The numeral.roman2int method is very permissive
                        # for archaic numeral forms, which is good.
                        num = roman2int(line)
                    except ValueError:
                        # not a roman numeral
                        pass
                    mylines[ourpageno]['ocr_roman'] = placement
                    candidate_romans.append(num)
                elif pageno:
                    mylines[ourpageno]['ocr_digits'] = placement
                    candidate_digits.append(int(line))

        # The algorithms to find false positives in page number candidates
        # are resource intensive, so this excludes anything where the candidate
        # numbers aren't monotonically increasing.
        if candidate_digits and is_increasing(candidate_digits):
            self.pagenums_found = True
            self.logger.debug("Page #s found: {}".format(candidate_digits))
        if candidate_romans and is_increasing(candidate_romans):
            self.rpagenums_found = True
            self.logger.debug("Roman #s found: {}".format(candidate_romans))

        # identify match ratio
        fuzz_consecutive = 0
        fuzz_alternating = 0
        for k, v in mylines.items():
            # Check to see if there's still one page forward
            if k + 1 in mylines:
                ratio_consecutive = fuzz.ratio(
                    v['text'],
                    mylines[k + 1]['text']
                )
                mylines[k]['ratio_consecutive'] = ratio_consecutive
                fuzz_consecutive += ratio_consecutive
            # Check to see if there's still two pages forward
            if k + 2 in mylines:
                ratio_alternating = fuzz.ratio(
                    v['text'],
                    mylines[k + 2]['text']
                )
                mylines[k]['ratio_alternating'] = ratio_alternating
                fuzz_alternating += ratio_alternating

        # occasional similar first/last lines might happen in all texts,
        # so only identify headers & footers if there are many of them
        HEADERS_PRESENT_THRESHOLD = int(
            config.get('Main', 'HEADERS_PRESENT_THRESHOLD')
        )
        if len(mylines) > 2:
            average_consecutive = fuzz_consecutive / (len(mylines) - 1)
            average_alternating = fuzz_alternating / (len(mylines) - 2)
            self.logger.debug("{}: consecutive fuzz avg.: {}".format(
                placement,
                average_consecutive
            ))
            self.logger.debug("{}: alternating fuzz avg.: {}".format(
                placement,
                average_alternating
            ))
            if average_consecutive > HEADERS_PRESENT_THRESHOLD:
                if placement == 'first':
                    self.headers_present = 'consecutive'
                else:
                    self.footers_present = 'consecutive'
                self.logger.debug(
                    "{} repeated, consecutive pages".format(placement)
                )
            elif average_alternating > HEADERS_PRESENT_THRESHOLD:
                if placement == 'first':
                    self.headers_present = 'alternating'
                else:
                    self.footers_present = 'alternating'
                self.logger.debug(
                    "{} repeated, alternating pages".format(placement)
                )

[docs]    def is_header_footer(self, block, placement):
        """
        Given a block and our identified text structure, return True if this
        block's text is a header, footer, or page number to be ignored, False
        otherwise.
        """
        THRESHOLD = int(
            config.get('Main', 'FUZZY_HEADER_THRESHOLD')
        )

        # running this on first lines or last lines?
        if placement == 'first':
            mylines = self.firsts
        else:
            mylines = self.lasts

        ourpageno = block['page_no']
        if ourpageno in mylines:
            if (
                self.rpagenums_found and
                'ocr_roman' in mylines[ourpageno] and
                mylines[ourpageno]['ocr_roman'] == placement
            ):
                # This is an identified roman numeral page number
                return True
            if (
                self.pagenums_found and
                'ocr_digits' in mylines[ourpageno] and
                mylines[ourpageno]['ocr_digits'] == placement
            ):
                # This is an identified page number
                self.logger.debug(
                    "identified page number: {}".format(block['text'])
                )
                return True
            if (
                self.headers_present == 'consecutive' and
                'ratio_consecutive' in mylines[ourpageno] and
                mylines[ourpageno]['ratio_consecutive'] >= THRESHOLD
            ):
                return True
            if (
                self.headers_present == 'alternating' and
                'ratio_alternating' in mylines[ourpageno] and
                mylines[ourpageno]['ratio_alternating'] >= THRESHOLD
            ):
                return True
        return False

[docs]    def set_metadata(self):
        """
        Set the metadata on the epub object
        """
        self.book.set_identifier(self.metadata['identifier'][0])
        for language in self.metadata['language']:
            self.book.set_language(language)
        for title in self.metadata['title']:
            self.book.set_title(title)
        if 'creator' in self.metadata:
            creator_uid = 'creator'
            for i, creator in enumerate(self.metadata['creator']):
                self.book.add_author(creator, uid=creator_uid)
                creator_uid = 'creator_{creator_uid}'.format(creator_uid=i)
        if 'description' in self.metadata:
            for description in self.metadata['description']:
                self.book.add_metadata('DC', 'description', description)
        if 'publisher' in self.metadata:
            for publisher in self.metadata['publisher']:
                self.book.add_metadata('DC', 'publisher', publisher)
        if 'identifier-access' in self.metadata:
            for identifier_access in self.metadata['identifier-access']:
                self.book.add_metadata(
                    'DC', 'identifier', 'Access URL: {}'.format(
                        identifier_access
                    )
                )
        if 'identifier-ark' in self.metadata:
            for identifier_ark in self.metadata['identifier-ark']:
                self.book.add_metadata(
                    'DC', 'identifier', 'urn:ark:{}'.format(identifier_ark)
                )
        if 'isbn' in self.metadata:
            for isbn in self.metadata['isbn']:
                self.book.add_metadata(
                    'DC', 'identifier', 'urn:isbn:{}'.format(isbn)
                )
        if 'oclc-id' in self.metadata:
            for oclc_id in self.metadata['oclc-id']:
                self.book.add_metadata(
                    'DC', 'identifier', 'urn:oclc:{}'.format(oclc_id)
                )
        if 'external-identifier' in self.metadata:
            for external_identifier in self.metadata['external-identifier']:
                self.book.add_metadata('DC', 'identifier', external_identifier)
        if 'related-external-id' in self.metadata:
            for related_external_id in self.metadata['related-external-id']:
                self.book.add_metadata('DC', 'identifier', related_external_id)
        if 'subject' in self.metadata:
            for subject in self.metadata['subject']:
                self.book.add_metadata('DC', 'subject', subject)
        if 'date' in self.metadata:
            for date in self.metadata['date']:
                self.book.add_metadata('DC', 'date', date)

[docs]    def craft_html(self):
        """
        Assembles the XHTML content.

        Create some minimal navigation:
        * Break sections at text elements marked role: heading
        * Break files at any headings with roleLevel: 1
        Imperfect, but better than having no navigation or monster files.

        Images will get alternative text of "Picture #" followed by an index
        number for this image. Barring real alternative text for
        true accessibility, this at least adds some identifying information.
        """

        # Default section to hold cover image plus all until the 1st heading
        if 'title' in self.metadata:
            heading = self.metadata['title'][0]
        else:
            heading = "Opening Section"
        self.picnum = 1
        blocks_index = -1
        self.last_row = False
        pagetype = ''
        prev_pagetype = ''

        # Look for headers and page numbers
        # FR10 has markup but isn't reliable so look there as well
        self.identify_headers_footers_pagenos('first')
        self.identify_headers_footers_pagenos('last')
        self.last_row = False
        self.last_cell = False

        # Make the initial chapter stub
        chapter = self.make_chapter(heading)
        endnotes = '<ul>'
        noteref = 1

        # Make a title page
        chapter.content += u'<h1 dir="ltr" class="center">{}</h1>'.format(
            heading
        )
        if 'title-alt-script' in self.metadata:
            for i in self.metadata['title-alt-script']:
                chapter.content += (
                    u'<p dir="auto" class="center bold big">{}</p>'
                ).format(i)
        if 'creator' in self.metadata:
            for i in self.metadata['creator']:
                chapter.content += (
                    u'<p dir="ltr" class="center bold">{}</p>'
                ).format(i)
        if 'creator-alt-script' in self.metadata:
            for i in self.metadata['creator-alt-script']:
                chapter.content += (
                    u'<p dir="auto" class="center bold">{}</p>'
                ).format(i)
        chapter.content += (
            '<div class="offset">'
            '<p dir="ltr">This book was produced in EPUB format by the '
            'Internet Archive.</p> '
            '<p dir="ltr">The book pages were scanned and converted to EPUB '
            'format automatically. This process relies on optical character '
            'recognition, and is somewhat susceptible to errors. The book may '
            'not offer the correct reading sequence, and there may be '
            'weird characters, non-words, and incorrect guesses at '
            'structure. Some page numbers and headers or footers may remain '
            'from the scanned page. The process which identifies images might '
            'have found stray marks on the page which are not actually images '
            'from the book. The hidden page numbering which may be available '
            'to your ereader corresponds to the numbered pages in the print '
            'edition, but is not an exact match;  page numbers will increment '
            'at the same rate as the corresponding print edition, but we may '
            'have started numbering before the print book\'s visible page '
            'numbers.  The Internet Archive is working to improve the '
            'scanning process and resulting books, but in the meantime, we '
            'hope that this book will be useful to you.</p> '
            '<p dir="ltr">The Internet Archive was founded in 1996 to build '
            'an Internet library and to promote universal access to all '
            'knowledge. The Archive\'s purposes include offering permanent '
            'access for researchers, historians, scholars, people with '
            'disabilities, and ' 'the general public to historical '
            'collections that exist in digital format. The Internet Archive '
            'includes texts, audio, moving images, '
            'and software as well as archived web pages, and provides '
            'specialized services for information access for the blind and '
            'other persons with disabilities.</p>'
            '<p>Created with abbyy2epub (v.%s)</p></div>'
        ) % __version__

        for block in self.blocks:
            blocks_index += 1

            # Skip pages that  we don't want to include
            if 'type' not in block:
                continue
            # Get the pageType from scandata
            if (
                'page_no' in block and
                block['page_no'] in self.pages
            ):
                prev_pagetype = pagetype
                pagetype = self.pages[block['page_no']]
            else:
                # Treat it as Normal if it's not set
                pagetype = 'Normal'
            if pagetype in skippable_pages:
                continue

            # set the block style, if there is one
            if (
                'style' in block and
                'fontstyle' in block['style']
            ):
                fclass = ''
                fontstyle = block['style']['fontstyle']
                fsize = fontstyle['fs']
                if 'italic' in fontstyle:
                    fclass += 'italic '
                if 'bold' in fontstyle:
                    fclass += 'bold '
                if 'Serif' in fontstyle['ff'] or 'Times' in fontstyle['ff']:
                    fclass += 'serif '
                elif 'Sans' in fontstyle['ff']:
                    fclass += 'sans '

                fstyling = (
                    'class="{fclass}" style="font-size: {fsize}pt"'
                ).format(
                    fclass=fclass,
                    fsize=fsize,
                )
            else:
                fstyling = ''

            # Make chapters for certain page types, for accessible navigation
            pagetypes = {
                'contents': 'Table of Contents',
                'contributions': 'Contributions',
                'copyright': 'Copyright Page',
                'glossary': 'Glossary',
                'index': 'Index',
                'introduction': 'Introduction',
                'preface': 'Preface',
                'reference': 'Reference',
                'title': 'Title Page',
            }
            if (pagetype in pagetypes and pagetype != prev_pagetype):
                chapter = self.make_chapter(pagetypes[pagetype])

            if block['type'] == 'Text':
                text = block['text']
                role = block['role']

                # This is the first text element on the page
                if 'first' in block:
                    # Look for headers and page numbers
                    if self.is_header_footer(block, 'first'):
                        self.logger.debug("Stripping header {}".format(text))
                        continue

                # Look for footers and page numbers
                if (
                    'last' in block and
                    self.is_header_footer(block, 'last')
                ):
                        self.logger.debug("Stripping footer {}".format(text))
                        continue
                if role == 'footnote':
                    # Footnote. Our ABBYY markup doesn't indicate references,
                    # so fake them, right above the bottom of the page so
                    # they'll be reachable by all adaptive tech & user agents.
                    # Place as endnotes to improve cross-ereader reachability.

                    chapter.content += (
                        u'<p class="small">'
                        u'<a epub:type="noteref" href="#n{page}_{ref}">'
                        u'Note {ref}</a></p>'
                    ).format(
                        page=block['page_no'],
                        ref=noteref,
                    )
                    # must use now deprecated "rearnote" instead of "endnote"
                    # for now; endnote support is limited. Change when more
                    # readers support endnote.
                    endnotes += (
                        u'<li><aside epub:type="rearnote" id="n{page}_{ref}">'
                        u'{text}</aside></li>'
                    ).format(
                        page=block['page_no'],
                        ref=noteref,
                        text=text,
                    )
                    noteref += 1
                elif role == 'tableCaption':
                    # It would be ideal to mark up table captions as <caption>
                    # within the associated table.  However, the ABBYY markup
                    # doesn't have a way to associate the caption with the
                    # specific table, and there's no way of knowing if the
                    # caption is for a table immediately following or
                    # immediately prior. Add a little styling to make it more
                    # obvious, and some accessibility helpers.
                    chapter.content += (
                        u'<p {style}><span class="sr-only">'
                        u'Table caption</span>{text}</p>'
                    ).format(
                        style=fstyling,
                        text=text,
                    )
                elif role == 'heading':
                    if int(block['heading']) > 1:
                        # Heading >1. Format as heading
                        # but don't make new chapter.
                        chapter.content += u'<h{lev}>{text}</h{lev}>'.format(
                            lev=block['heading'], text=text
                        )
                    else:
                        # attach any endnotes to the chapter.
                        if noteref > 1:
                            chapter.content += '<hr /><h2>Chapter Notes</h2>'
                            chapter.content += endnotes
                            chapter.content += '</ol>'
                            noteref = 1
                            endnotes = '<ol>'

                        # Heading 1. Begin the new chapter
                        chapter = self.make_chapter(text)
                        chapter.content = u'<h{lev}>{text}</h{lev}>'.format(
                            lev=block['heading'], text=text
                        )
                else:
                    # Regular or other text block. Add its heading to the
                    # chapter content. In theory a table of contents could get
                    # parsed for page numbers and turned into a hyperlinked
                    # nav toc pointing to page elements, but relying on headers
                    # is probably more reliable.
                    chapter.content += u'<p {style}>{text}</p>'.format(
                        style=fstyling,
                        text=text,
                    )
            elif block['type'] == 'Page':
                # Nest this conditional; we don't want to short circuit if no
                # pages_support. Check to make sure we're not just repeating
                # page breaks if the interstitial content was omitted.
                if (
                    self.metadata['PAGES_SUPPORT'] and
                    not chapter.content.endswith('epub:type="pagebreak"/>')
                ):
                    chapter.content += ebooklib_utils.create_pagebreak(
                        str(block['text'])
                    )
            elif (
                block['type'] == 'Picture' and
                pagetype != 'Cover'
            ):
                # Image
                content = self.make_image(block)
                if content:
                    chapter.content += content
            elif (
                block['type'] == 'Separator' or
                block['type'] == 'SeparatorsBox'
            ):
                # Separator blocks seem to be fairly randomly applied and don't
                # correspond to anything useful in the original content
                pass
            elif block['type'] == 'Table':
                chapter.content += u'<table>'
            elif block['type'] == 'TableRow':
                chapter.content += u'<tr>'
                if 'last_table_elem' in block:
                    self.last_row = True
            elif block['type'] == 'TableCell':
                chapter.content += u'<td>'
                if 'last_table_elem' in block:
                    self.last_cell = True
            elif block['type'] == 'TableText':
                chapter.content += u'<p {style}>{text}</p>'.format(
                    style=fstyling,
                    text=block['text'],
                )
                if 'last_table_elem' in block:
                    chapter.content += u'</td>'
                    if self.last_cell:
                        chapter.content + u'</tr>'
                        self.last_cell = False
                        if self.last_row:
                            chapter.content += u'</table>'
                            self.last_row = False
            else:
                self.logger.debug(
                    "Ignoring Block:\n Type: {}\n Attribs: {}".format(
                        block['type'], block['style']
                    )
                )

[docs]    def craft_epub(self, epub_outfile="out.epub", tmpdir=None):
        """ Assemble the extracted metadata & text into an EPUB  """

        # Even if we clean up properly afterwards, using TemporaryDirectory
        # outside of a context manager seems to cause a resource leak
        if tmpdir:
            tmpdir = os.path.abspath(tmpdir)
            os.makedirs(tmpdir, exist_ok=True)
        with tempfile.TemporaryDirectory(dir=tmpdir) as self.tmpdir:
            self.abbyy_file = "{tmp}/{base}_abbyy".format(
                tmp=self.tmpdir, base=self.item_identifier
            )
            self.logger.debug("Temp directory: {}\nidentifier: {}".format(
                self.tmpdir, self.item_identifier))
            # Unzip ABBYY file to disk. (Might be too huge to hold in memory.)
            with gzip.open(self.abbyy_gz, 'rb') as infile:
                with open(self.abbyy_file, 'wb') as outfile:
                    self.logger.debug(
                        "Abbyy tmp dir: {}".format(self.abbyy_file)
                    )
                    for line in infile:
                        outfile.write(line)

            # read in the page-by-page scandata file
            self.load_scandata_pages()

            # Extract the page images and create the cover file
            self.extract_images()
            self.extract_cover()

            # parse the ABBYY
            parser = AbbyyParser(
                self.abbyy_file,
                self.meta_xml,
                self.metadata,
                self.paragraphs,
                self.blocks,
                debug=self.debug,
            )
            parser.parse_abbyy()
            self.logger.debug("Done with parse_abbyy")

            # Text direction: convert IA abbreviation to epub abbreviation
            direction = {
                'lr': 'ltr',
                'rl': 'rtl',
            }

            if 'page-progression' in self.metadata:
                self.progression = direction[
                    self.metadata['page-progression'][0]
                ]
                self.book.set_direction(self.progression)
            else:
                # The epub, used in the spine, uses 'default' for unspecified
                # direction. HTML, used in the content pages, uses 'auto'.
                self.progression = 'auto'
                self.book.set_direction('default')

            # get the finereader version
            if 'fr-version' in self.metadata:
                self.version = self.metadata['fr-version']

            # make the HTML chapters
            self.logger.debug("craft_html")
            self.craft_html()
            self.logger.debug("Done assembling the HTML")

            # Set the book's metadata
            self.set_metadata()

            # set the accessibility metadata
            self.create_accessibility_metadata()

            # Navigation for EPUB 3 & EPUB 2 fallback
            self.book.toc = self.chapters
            self.book.add_item(epub.EpubNcx())
            self.book.add_item(epub.EpubNav())
            # cover_ncx hack to work around Adobe Digital Editions problem
            self.book.spine = ['cover', 'nav', ] + self.chapters

            # define CSS style
            style = """
                .center {text-align: center}
                .sr-only {
                    width: 1px;
                    height: 1px;
                    padding: 0;
                    margin: -1px;
                    overflow: hidden;
                    clip: rect(0,0,0,0);
                    border: 0;
                }
                .strong {font-weight: bold;}
                .italic {font-style: italic;}
                .serif {font-family: serif;}
                .sans {font-family: sans-serif;}
                .big {font-size: 1.5em;}
                .small {font-size: .75em;}
                .offset {
                    margin: 1em;
                    padding: 1.5em;
                    border: black 1px solid;
                }
                img {
                    padding: 0;
                    margin: 0;
                    max-width: 100%;
                    max-height: 100%;
                    column-count: 1;
                    break-inside: avoid;
                    oeb-column-number: 1;
                }
                """

            css_file = epub.EpubItem(
                uid="style_nav",
                file_name="style/style.css",
                media_type="text/css",
                content=style
            )
            self.book.add_item(css_file)

            if epub_outfile.endswith('.epub'):
                epub_outfile = epub_outfile
            else:
                epub_outfile = '%s.epub' % epub_outfile
            epub.write_epub(epub_outfile, self.book, {})

            # run validation on epub
            if self.debug or self.epubcheck:
                self.validate_epub(epub_outfile, level=self.epubcheck)
            if self.debug or self.ace:
                self.validate_a11y(epub_outfile, level=self.ace)

[docs]    def validate_epub(self, epub_file, level=None):
        self.logger.debug("Running EpubCheck on {}".format(epub_file))
        LEVELS = ['warning', 'error', 'fatal']
        level = level.lower() or self.DEFAULT_EPUBCHECK_LEVEL
        try:
            desired_levels = LEVELS[LEVELS.index(level):]
        except ValueError:
            self.logger.error(
                "Invalid --epubcheck level: `%s`.\n"
                "Falling back to default: `%s`" % (
                    level, self.DEFAULT_EPUBCHECK_LEVEL))
            desired_levels = LEVELS

        result = self.verifier.run_epubcheck(epub_file)
        errors = [err for err in result.messages if
                  # only keep desired_levels
                  err.level.lower() in desired_levels]
        if errors:
            raise RuntimeError(errors)

[docs]    def validate_a11y(self, epub_file, level=None):
        """
        Individual test failures are logged in EARL syntax
        https://daisy.github.io/ace/docs/report-json/
        Structurally:
        "assertions": [
            {
              "@type": "earl:assertion",
              "earl:result": {
                "earl:outcome": "fail"
              },
              "assertions": [
                {
                  "@type": "earl:assertion",
                  "earl:result": {
                    "earl:outcome": "fail",
                    "html": "[The invalid HTML]"
                  },
                  "earl:test": {
                    "earl:impact": "serious",
                    "help": {
                      "dct:description": "[Plain language error]"
                    },
                  }
                }
              ],
              "earl:testSubject": {
                "url": "cover.xhtml",
              },
            },
        ]
        """
        self.logger.debug("Running DAISY Ace on {}".format(epub_file))
        LEVELS = ['minor', 'moderate', 'serious', 'critical']
        level = level.lower() or self.DEFAULT_ACE_LEVEL
        try:
            desired_levels = LEVELS[LEVELS.index(level):]
        except ValueError:
            self.logger.error(
                "Invalid --ace level: `%s`.\n"
                "Falling back to default: `%s`" % (
                    level, self.DEFAULT_ACE_LEVEL))
            desired_levels = LEVELS

        result, error = self.verifier.run_ace(epub_file, self.tmpdir)

        # Build a list of errors, with the most important fields
        errors = list()
        if 'assertions' in result:
            for assertion in result['assertions']:
                if (
                    assertion['@type'] == "earl:assertion" and
                    assertion['earl:result']['earl:outcome'] == "fail"
                ):
                    test_subject = assertion['earl:testSubject']['url']
                    for each in assertion['assertions']:
                        err_level = each['earl:test']['earl:impact']
                        if err_level.lower() in desired_levels:
                            each['earl:testSubject'] = test_subject
                            errors.append(each)
        if errors:
            raise RuntimeError(errors)