# Copyright 2017 Deborah Kaplan
#
# This file is part of Abbyy-to-epub3.
# Source code is available at <https://github.com/deborahgu/abbyy-to-epub3>.
#
# Abbyy-to-epub3 is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from collections import OrderedDict
from ebooklib import epub
from ebooklib import utils as ebooklib_utils
from fuzzywuzzy import fuzz
from numeral import roman2int
from pkg_resources import resource_filename
from zipfile import BadZipFile, ZipFile
import configparser
import gzip
import logging
import os
import sys
import re
import subprocess
import tempfile
from abbyy_to_epub3 import __version__
from abbyy_to_epub3.constants import skippable_pages
from abbyy_to_epub3.parse_abbyy import AbbyyParser
from abbyy_to_epub3.image_processing import factory as ImageFactory
from abbyy_to_epub3.parse_scandata import ScandataParser
from abbyy_to_epub3.utils import dirtify_xml, is_increasing
from abbyy_to_epub3.verify_epub import EpubVerify
# Set up configuration
config = configparser.ConfigParser()
configfile = resource_filename("abbyy_to_epub3", "config.ini")
config.read(configfile)
ERR_MISSING_SCANDATA = 3
[docs]class ArchiveBookItem(object):
"""Archive.org is a website which contains an archive of items
composed of archived digital content. Archive.org items are
distributed across a cluster of machines called datanodes. In
order to access the files of an item, you need to know 4 things:
a) The Archive.org `item_identifier` (the unique ID of this item)
e.g. https://archive.org/details/{item_identifier}
b) the datanode server address which hosts this item
c) the `item_dir` which is the file path on this datanode where
this items files are kept
d) the name of the files within this `item_dir`
Certain archive.org items are specifically structured (file
organizations, contents, names) to store and play Books. Every
Archive Book Item contains the following files:
- a jp2.zip containing all the scanned images of the book
- an abbyy file containing the OCR'd plaintest of these scans
- scandata.xml whose metadata describes the structure of the book
(metadata, pages numbers)
- meta.xml which describes the entire archive.org *item*
A complication is that Archive.org Book Items may contain 1 or
more books. In order to accommodate this subtlety and delineate
between books, an `item_dir` and `item_identifier` are not
sufficient to isolate a specific book. To circumvent this
limitation, we require another identifier called the
`item_bookpath` which acts as a prefix to the files of a specific
book. Given a datanode and an `item_dir` of an Archive Book Item,
all the constituent files for a book can be constructed using
`item_identifier` and `item_bookpath` in the following ways:
- There is a single global metadata manifest file for the entire
Archive Item named `{item_identifier}_meta.xml`.
- All of the other book specific files follow the form
`{item_bookpath}_{file}`. e.g. `{item_bookpath}_abbyy.gz`
"""
def __init__(self, item_dir, item_identifier, item_bookpath):
self.item_dir = item_dir
self.item_identifier = item_identifier
self.item_bookpath = item_bookpath
# Guarantee all input file exist
# These members will be set as self.`name`_`ext`, e.g. self.meta_xml
input_files = [
# prefix, name, ext
(item_identifier, 'meta', 'xml'),
(item_bookpath, 'abbyy', 'gz'),
(item_bookpath, 'scandata', 'xml'),
(item_bookpath, 'jp2', 'zip')]
for (subdir, name, ext) in input_files:
dependency = os.path.abspath(
os.path.join(item_dir, '%s_%s.%s' % (subdir, name, ext)))
if not os.path.exists(dependency):
self.logger.debug(
"Invalid path to %s.%s: %s" % (name, ext, dependency)
)
if name == "scandata":
sys.exit(ERR_MISSING_SCANDATA)
raise OSError(
"Invalid path to %s.%s: %s" % (name, ext, dependency)
)
setattr(self, '%s_%s' % (name, ext), dependency)
[docs]class Ebook(ArchiveBookItem):
"""
Ebook is a utility for generating epub3 files based on Archive.org items.
Holds extracted information about a book & the ebooklib EPUB object.
"""
DEFAULT_EPUBCHECK_LEVEL = 'warning'
DEFAULT_ACE_LEVEL = 'minor'
def __init__(
self, item_dir, item_identifier, item_bookpath,
debug=False, epubcheck=None, ace=None,
):
self.logger = logging.getLogger(__name__)
if debug:
self.logger.addHandler(logging.StreamHandler())
self.logger.setLevel(logging.DEBUG)
# Initialize all the book's variables cleanly
self.debug = debug
self.epubcheck = epubcheck or (
# If no epubcheck specified and we're in debug mode, run
# --epubcheck warning
self.DEFAULT_EPUBCHECK_LEVEL if self.debug else None)
self.ace = ace or (
# If no ace specified and we're in debug mode, run
# --ace minor
self.DEFAULT_ACE_LEVEL if self.debug else None)
self.metadata = {} # the book's metadata
self.blocks = [] # all <blocks> with contents, attributes
self.paragraphs = {} # paragraph style info
self.tmpdir = '' # stores converted images & extracted zip files
self.abbyy_file = '' # the ABBYY XML file
self.chapters = [] # holds each of the chapter (EpubHtml) objects
self.progression = '' # page direction
self.firsts = {} # all first lines per-page
self.lasts = {} # all last lines per-page
self.pages = OrderedDict() # page-by-page information from scandata
self.chapter_no = 0 # current number of identified chapters
# are there headers, footers, or page numbers?
self.headers_present = False
self.pagenums_found = False
self.rpagenums_found = False
self.table = False
self.table_row = False
self.table_cell = False
self.book = epub.EpubBook() # the book itself
# ebooklib.epub doesn't clean up cleanly without reset,
# causing problems on consecutive runs
self.book.reset()
self.verifier = EpubVerify(self.debug)
# Choose the image processing library
try:
subprocess.run(
["kdu_compress", "-v"], stdout=subprocess.DEVNULL, check=True
)
self.image_processor = "kakadu"
except (FileNotFoundError, subprocess.CalledProcessError) as e:
self.image_processor = "pillow"
self.logger.debug("Image processing with {}.".format(self.image_processor))
super(Ebook, self).__init__(item_dir, item_identifier, item_bookpath)
[docs] def load_scandata_pages(self):
"""
Parse the page-by-page scandata file. This stores page size,
right or left leaf, and page type (eg copyright, color card, etc).
"""
parser = ScandataParser(
self.scandata_xml,
self.pages,
debug=self.debug,
)
parser.parse_scandata()
[docs] def get_cover_leaf(self):
"""
Try to find a cover image. If nothing is tagged as 'Cover', use
the first page tagged 'Title'. If nothing is tagged as
'Title', either, use the first page tagged
'Normal'. Self.pages is an OrderedDict so break as soon as you
find something useful, and don't search the whole set of
pages.
"""
pages_iter = iter(self.pages)
for p in pages_iter:
if self.pages[p] == 'cover':
cover_leaf = p
break
elif self.pages[p] == 'title':
cover_leaf = p
break
elif self.pages[p] == 'normal':
cover_leaf = p
break
try:
return cover_leaf
except NameError:
e = "No pages in scandata marked as Cover, Title, or Normal"
self.logger.error(e)
raise RuntimeError(e)
[docs] def image_dim(self, block):
"""
Given a dict object containing the block info for an image, generate
a tuple of its dimensions:
(left, top, right, bottom)
"""
left = int(block['style']['l'])
top = int(block['style']['t'])
right = int(block['style']['r'])
bottom = int(block['style']['b'])
return (left, top, right, bottom)
[docs] def make_image(self, block):
"""
Given a dict object containing the block info for an image, generate
the image HTML
"""
page_no = block['page_no']
if page_no == 0:
# The first page's image is made into the cover automatically
return
# pad out the filename to four digits
origfile = '{dir}/{item_bookpath}_jp2/{item_bookpath}_{page:0>4}.jp2'.format(
dir=self.tmpdir,
item_bookpath=self.item_bookpath,
page=page_no
)
if not os.path.isfile(origfile):
return
basefile = 'img_{:0>4}.png'.format(self.picnum)
outfile = '{}/{}'.format(self.tmpdir, basefile)
in_epub_imagefile = 'images/{}'.format(basefile)
# get image dimensions from ABBYY block attributes
# (left, top, right, bottom)
box = self.image_dim(block)
width = box[2] - box[0]
height = box[3] - box[1]
# some image processors also need the original page dimensions
pagewidth = float(block['style']['pagewidth'])
pageheight = float(block['style']['pageheight'])
pagedim = (pagewidth, pageheight)
# ignore if this image is entirely encapsulated in another image
for each_pic in self.metadata['pics_by_page']:
# Ignore if this is just the block itself
if each_pic == block:
continue
new_box = self.image_dim(each_pic)
if all(i >= j for i, j in zip(box, new_box)):
return
# make the image:
imageobj = ImageFactory(self.image_processor)
try:
imageobj.crop_image(origfile, outfile, dim=box, pagedim=pagedim)
except RuntimeError as e:
# for failed image creation, keep processing the epub
self.logger.error(e)
return ''
epubimage = epub.EpubImage()
epubimage.file_name = in_epub_imagefile
with open(outfile, 'rb') as f:
epubimage.content = f.read()
epubimage = self.book.add_item(epubimage)
# to approximate original layout, set the image container width to
# percentage of the page width
container_w = (width / pagewidth) * 100
content = u'''
<div style="width: {c_w}%;">
<img src="{src}" alt="Picture #{picnum}">
</div>
'''.format(
c_w=container_w,
src=in_epub_imagefile,
picnum=self.picnum,
w=width,
h=height,)
# increment the image number
self.picnum += 1
return content
[docs] def make_chapter(self, heading):
"""
Create a chapter section in an ebooklib.epub.
"""
# If we haven't passed a heading, just use the inferred chapter number
# which won't correspond to the original's chapters.
if not heading:
heading = "Chapter {}".format(self.chapter_no)
# If the previous chapter's content is empty, merge the two
# Use the first chapter's chapter name and number, which are
# likely to have been set by scandata and not OCR'd text.
if self.chapters and self.chapters[-1].content == u'':
chapter = self.chapters[-1]
chapter.content = u'<h2>{}</h2>'.format(heading)
else:
# Increment the chapter number before creating a new one
self.chapter_no += 1
# The epub library escapes the XML itself
chapter = epub.EpubHtml(
title=dirtify_xml(heading).replace("\n", " "),
direction=self.progression,
# pad out the filename to four digits
file_name='chap_{:0>4}.xhtml'.format(self.chapter_no),
lang='{}'.format(self.metadata['language'][0])
)
chapter.content = u''
chapter.add_link(
href='style/style.css', rel='stylesheet', type='text/css'
)
self.chapters.append(chapter)
self.book.add_item(chapter)
return chapter
[docs] def craft_html(self):
"""
Assembles the XHTML content.
Create some minimal navigation:
* Break sections at text elements marked role: heading
* Break files at any headings with roleLevel: 1
Imperfect, but better than having no navigation or monster files.
Images will get alternative text of "Picture #" followed by an index
number for this image. Barring real alternative text for
true accessibility, this at least adds some identifying information.
"""
# Default section to hold cover image plus all until the 1st heading
if 'title' in self.metadata:
heading = self.metadata['title'][0]
else:
heading = "Opening Section"
self.picnum = 1
blocks_index = -1
self.last_row = False
pagetype = ''
prev_pagetype = ''
# Look for headers and page numbers
# FR10 has markup but isn't reliable so look there as well
self.identify_headers_footers_pagenos('first')
self.identify_headers_footers_pagenos('last')
self.last_row = False
self.last_cell = False
# Make the initial chapter stub
chapter = self.make_chapter(heading)
endnotes = '<ul>'
noteref = 1
# Make a title page
chapter.content += u'<h1 dir="ltr" class="center">{}</h1>'.format(
heading
)
if 'title-alt-script' in self.metadata:
for i in self.metadata['title-alt-script']:
chapter.content += (
u'<p dir="auto" class="center bold big">{}</p>'
).format(i)
if 'creator' in self.metadata:
for i in self.metadata['creator']:
chapter.content += (
u'<p dir="ltr" class="center bold">{}</p>'
).format(i)
if 'creator-alt-script' in self.metadata:
for i in self.metadata['creator-alt-script']:
chapter.content += (
u'<p dir="auto" class="center bold">{}</p>'
).format(i)
chapter.content += (
'<div class="offset">'
'<p dir="ltr">This book was produced in EPUB format by the '
'Internet Archive.</p> '
'<p dir="ltr">The book pages were scanned and converted to EPUB '
'format automatically. This process relies on optical character '
'recognition, and is somewhat susceptible to errors. The book may '
'not offer the correct reading sequence, and there may be '
'weird characters, non-words, and incorrect guesses at '
'structure. Some page numbers and headers or footers may remain '
'from the scanned page. The process which identifies images might '
'have found stray marks on the page which are not actually images '
'from the book. The hidden page numbering which may be available '
'to your ereader corresponds to the numbered pages in the print '
'edition, but is not an exact match; page numbers will increment '
'at the same rate as the corresponding print edition, but we may '
'have started numbering before the print book\'s visible page '
'numbers. The Internet Archive is working to improve the '
'scanning process and resulting books, but in the meantime, we '
'hope that this book will be useful to you.</p> '
'<p dir="ltr">The Internet Archive was founded in 1996 to build '
'an Internet library and to promote universal access to all '
'knowledge. The Archive\'s purposes include offering permanent '
'access for researchers, historians, scholars, people with '
'disabilities, and ' 'the general public to historical '
'collections that exist in digital format. The Internet Archive '
'includes texts, audio, moving images, '
'and software as well as archived web pages, and provides '
'specialized services for information access for the blind and '
'other persons with disabilities.</p>'
'<p>Created with abbyy2epub (v.%s)</p></div>'
) % __version__
for block in self.blocks:
blocks_index += 1
# Skip pages that we don't want to include
if 'type' not in block:
continue
# Get the pageType from scandata
if (
'page_no' in block and
block['page_no'] in self.pages
):
prev_pagetype = pagetype
pagetype = self.pages[block['page_no']]
else:
# Treat it as Normal if it's not set
pagetype = 'Normal'
if pagetype in skippable_pages:
continue
# set the block style, if there is one
if (
'style' in block and
'fontstyle' in block['style']
):
fclass = ''
fontstyle = block['style']['fontstyle']
fsize = fontstyle['fs']
if 'italic' in fontstyle:
fclass += 'italic '
if 'bold' in fontstyle:
fclass += 'bold '
if 'Serif' in fontstyle['ff'] or 'Times' in fontstyle['ff']:
fclass += 'serif '
elif 'Sans' in fontstyle['ff']:
fclass += 'sans '
fstyling = (
'class="{fclass}" style="font-size: {fsize}pt"'
).format(
fclass=fclass,
fsize=fsize,
)
else:
fstyling = ''
# Make chapters for certain page types, for accessible navigation
pagetypes = {
'contents': 'Table of Contents',
'contributions': 'Contributions',
'copyright': 'Copyright Page',
'glossary': 'Glossary',
'index': 'Index',
'introduction': 'Introduction',
'preface': 'Preface',
'reference': 'Reference',
'title': 'Title Page',
}
if (pagetype in pagetypes and pagetype != prev_pagetype):
chapter = self.make_chapter(pagetypes[pagetype])
if block['type'] == 'Text':
text = block['text']
role = block['role']
# This is the first text element on the page
if 'first' in block:
# Look for headers and page numbers
if self.is_header_footer(block, 'first'):
self.logger.debug("Stripping header {}".format(text))
continue
# Look for footers and page numbers
if (
'last' in block and
self.is_header_footer(block, 'last')
):
self.logger.debug("Stripping footer {}".format(text))
continue
if role == 'footnote':
# Footnote. Our ABBYY markup doesn't indicate references,
# so fake them, right above the bottom of the page so
# they'll be reachable by all adaptive tech & user agents.
# Place as endnotes to improve cross-ereader reachability.
chapter.content += (
u'<p class="small">'
u'<a epub:type="noteref" href="#n{page}_{ref}">'
u'Note {ref}</a></p>'
).format(
page=block['page_no'],
ref=noteref,
)
# must use now deprecated "rearnote" instead of "endnote"
# for now; endnote support is limited. Change when more
# readers support endnote.
endnotes += (
u'<li><aside epub:type="rearnote" id="n{page}_{ref}">'
u'{text}</aside></li>'
).format(
page=block['page_no'],
ref=noteref,
text=text,
)
noteref += 1
elif role == 'tableCaption':
# It would be ideal to mark up table captions as <caption>
# within the associated table. However, the ABBYY markup
# doesn't have a way to associate the caption with the
# specific table, and there's no way of knowing if the
# caption is for a table immediately following or
# immediately prior. Add a little styling to make it more
# obvious, and some accessibility helpers.
chapter.content += (
u'<p {style}><span class="sr-only">'
u'Table caption</span>{text}</p>'
).format(
style=fstyling,
text=text,
)
elif role == 'heading':
if int(block['heading']) > 1:
# Heading >1. Format as heading
# but don't make new chapter.
chapter.content += u'<h{lev}>{text}</h{lev}>'.format(
lev=block['heading'], text=text
)
else:
# attach any endnotes to the chapter.
if noteref > 1:
chapter.content += '<hr /><h2>Chapter Notes</h2>'
chapter.content += endnotes
chapter.content += '</ol>'
noteref = 1
endnotes = '<ol>'
# Heading 1. Begin the new chapter
chapter = self.make_chapter(text)
chapter.content = u'<h{lev}>{text}</h{lev}>'.format(
lev=block['heading'], text=text
)
else:
# Regular or other text block. Add its heading to the
# chapter content. In theory a table of contents could get
# parsed for page numbers and turned into a hyperlinked
# nav toc pointing to page elements, but relying on headers
# is probably more reliable.
chapter.content += u'<p {style}>{text}</p>'.format(
style=fstyling,
text=text,
)
elif block['type'] == 'Page':
# Nest this conditional; we don't want to short circuit if no
# pages_support. Check to make sure we're not just repeating
# page breaks if the interstital content was omitted.
if (
self.metadata['PAGES_SUPPORT'] and
not chapter.content.endswith('epub:type="pagebreak"/>')
):
chapter.content += ebooklib_utils.create_pagebreak(
str(block['text'])
)
elif (
block['type'] == 'Picture' and
pagetype != 'Cover'
):
# Image
content = self.make_image(block)
if content:
chapter.content += content
elif (
block['type'] == 'Separator' or
block['type'] == 'SeparatorsBox'
):
# Separator blocks seem to be fairly randomly applied and don't
# correspond to anything useful in the original content
pass
elif block['type'] == 'Table':
chapter.content += u'<table>'
elif block['type'] == 'TableRow':
chapter.content += u'<tr>'
if 'last_table_elem' in block:
self.last_row = True
elif block['type'] == 'TableCell':
chapter.content += u'<td>'
if 'last_table_elem' in block:
self.last_cell = True
elif block['type'] == 'TableText':
chapter.content += u'<p {style}>{text}</p>'.format(
style=fstyling,
text=block['text'],
)
if 'last_table_elem' in block:
chapter.content += u'</td>'
if self.last_cell:
chapter.content + u'</tr>'
self.last_cell = False
if self.last_row:
chapter.content += u'</table>'
self.last_row = False
else:
self.logger.debug(
"Ignoring Block:\n Type: {}\n Attribs: {}".format(
block['type'], block['style']
)
)
[docs] def craft_epub(self, epub_outfile="out.epub", tmpdir=None):
""" Assemble the extracted metadata & text into an EPUB """
# Even if we clean up properly afterwards, using TemporaryDirectory
# outside of a context manager seems to cause a resource leak
if tmpdir:
tmpdir = os.path.abspath(tmpdir)
os.makedirs(tmpdir, exist_ok=True)
with tempfile.TemporaryDirectory(dir=tmpdir) as self.tmpdir:
self.abbyy_file = "{tmp}/{base}_abbyy".format(
tmp=self.tmpdir, base=self.item_identifier
)
self.logger.debug("Temp directory: {}\nidentifier: {}".format(
self.tmpdir, self.item_identifier))
# Unzip ABBYY file to disk. (Might be too huge to hold in memory.)
with gzip.open(self.abbyy_gz, 'rb') as infile:
with open(self.abbyy_file, 'wb') as outfile:
self.logger.debug(
"Abbyy tmp dir: {}".format(self.abbyy_file)
)
for line in infile:
outfile.write(line)
# read in the page-by-page scandata file
self.load_scandata_pages()
# Extract the page images and create the cover file
self.extract_images()
self.extract_cover()
# parse the ABBYY
parser = AbbyyParser(
self.abbyy_file,
self.meta_xml,
self.metadata,
self.paragraphs,
self.blocks,
debug=self.debug,
)
parser.parse_abbyy()
self.logger.debug("Done with parse_abbyy")
# Text direction: convert IA abbreviation to epub abbreviation
direction = {
'lr': 'ltr',
'rl': 'rtl',
}
if 'page-progression' in self.metadata:
self.progression = direction[
self.metadata['page-progression'][0]
]
self.book.set_direction(self.progression)
else:
# The epub, used in the spine, uses 'default' for unspecified
# direction. HTML, used in the content pages, uses 'auto'.
self.progression = 'auto'
self.book.set_direction('default')
# get the finereader version
if 'fr-version' in self.metadata:
self.version = self.metadata['fr-version']
# make the HTML chapters
self.logger.debug("craft_html")
self.craft_html()
self.logger.debug("Done assembling the HTML")
# Set the book's metadata
self.set_metadata()
# set the accessibility metadata
self.create_accessibility_metadata()
# Navigation for EPUB 3 & EPUB 2 fallback
self.book.toc = self.chapters
self.book.add_item(epub.EpubNcx())
self.book.add_item(epub.EpubNav())
# cover_ncx hack to work around Adobe Digital Editions problem
self.book.spine = ['cover', 'nav', ] + self.chapters
# define CSS style
style = """
.center {text-align: center}
.sr-only {
width: 1px;
height: 1px;
padding: 0;
margin: -1px;
overflow: hidden;
clip: rect(0,0,0,0);
border: 0;
}
.strong {font-weight: bold;}
.italic {font-style: italic;}
.serif {font-family: serif;}
.sans {font-family: sans-serif;}
.big {font-size: 1.5em;}
.small {font-size: .75em;}
.offset {
margin: 1em;
padding: 1.5em;
border: black 1px solid;
}
img {
padding: 0;
margin: 0;
max-width: 100%;
max-height: 100%;
column-count: 1;
break-inside: avoid;
oeb-column-number: 1;
}
"""
css_file = epub.EpubItem(
uid="style_nav",
file_name="style/style.css",
media_type="text/css",
content=style
)
self.book.add_item(css_file)
if epub_outfile.endswith('.epub'):
epub_outfile = epub_outfile
else:
epub_outfile = '%s.epub' % epub_outfile
epub.write_epub(epub_outfile, self.book, {})
# run validation on epub
if self.debug or self.epubcheck:
self.validate_epub(epub_outfile, level=self.epubcheck)
if self.debug or self.ace:
self.validate_a11y(epub_outfile, level=self.ace)
[docs] def validate_epub(self, epub_file, level=None):
self.logger.debug("Running EpubCheck on {}".format(epub_file))
LEVELS = ['warning', 'error', 'fatal']
level = level.lower() or self.DEFAULT_EPUBCHECK_LEVEL
try:
desired_levels = LEVELS[LEVELS.index(level):]
except ValueError:
self.logger.error(
"Invalid --epubcheck level: `%s`.\n"
"Falling back to default: `%s`" % (
level, self.DEFAULT_EPUBCHECK_LEVEL))
desired_levels = LEVELS
result = self.verifier.run_epubcheck(epub_file)
errors = [err for err in result.messages if
# only keep desired_levels
err.level.lower() in desired_levels]
if errors:
raise RuntimeError(errors)
[docs] def validate_a11y(self, epub_file, level=None):
"""
Individual test failures are logged in EARL syntax
https://daisy.github.io/ace/docs/report-json/
Structurally:
"assertions": [
{
"@type": "earl:assertion",
"earl:result": {
"earl:outcome": "fail"
},
"assertions": [
{
"@type": "earl:assertion",
"earl:result": {
"earl:outcome": "fail",
"html": "[The invalid HTML]"
},
"earl:test": {
"earl:impact": "serious",
"help": {
"dct:description": "[Plain language error]"
},
}
}
],
"earl:testSubject": {
"url": "cover.xhtml",
},
},
]
"""
self.logger.debug("Running DAISY Ace on {}".format(epub_file))
LEVELS = ['minor', 'moderate', 'serious', 'critical']
level = level.lower() or self.DEFAULT_ACE_LEVEL
try:
desired_levels = LEVELS[LEVELS.index(level):]
except ValueError:
self.logger.error(
"Invalid --ace level: `%s`.\n"
"Falling back to default: `%s`" % (
level, self.DEFAULT_ACE_LEVEL))
desired_levels = LEVELS
result, error = self.verifier.run_ace(epub_file, self.tmpdir)
# Build a list of errors, with the most important fields
errors = list()
if 'assertions' in result:
for assertion in result['assertions']:
if (
assertion['@type'] == "earl:assertion" and
assertion['earl:result']['earl:outcome'] == "fail"
):
test_subject = assertion['earl:testSubject']['url']
for each in assertion['assertions']:
err_level = each['earl:test']['earl:impact']
if err_level.lower() in desired_levels:
each['earl:testSubject'] = test_subject
errors.append(each)
if errors:
raise RuntimeError(errors)