"""Sample functions to use in ``process`` option.

Divided in three parts:

    * Helper Functions
    * General Functions (applicable for many websites)
    * Site Specific Functions
"""

from copy import deepcopy
import logging
import re

from tosixinch import lxml_html
from tosixinch.clean import KEEP_STYLE  # noqa: F401

logger = logging.getLogger(__name__)

# lxml.html.defs.empty_tags (only for html4)
# Tags written without a closing tag.  ``make_tag`` checks membership here
# to choose between the ``<tag ... />`` and ``<tag>...</tag>`` forms.
self_closing_tags = (
    'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
    'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
    'source', 'track', 'wbr',
)

def fromstring(el):
    """Parse an HTML string with ``lxml_html.fromstring`` and return the result."""
    return lxml_html.fromstring(el)


def tostring(el):
    """Serialize an element to text (``encoding='unicode'``)."""
    return lxml_html.tostring(el, encoding='unicode')

# lxml.html module's parsing functions are
# rather complex wrappers of lxml.etree ones,
# mainly designed to parse broken htmls.
# They are useful, but in testing situations, a little confusing.
# I need a memorandum somewhere.
def _fromstring_examples():
    """lxml.html module's fromstring examples.

    >>> el = fromstring('<p>aaa</p>')
    >>> tostring(el)
    >>> tostring(el.getparent())
    >>> el = fromstring('<p>aaa</p><p>bbb</p>')
    >>> tostring(el)
    >>> tostring(el.getparent())
    >>> el = fromstring('<code>aaa</code><code>bbb</code>')
    >>> tostring(el)
    >>> el = fromstring('aaa<div>bbb</div>')
    >>> tostring(el)
    >>> el = fromstring('<body></body>')
    >>> tostring(el)

# ----------------------------------------------------------
# Helper Functions

def make_tag(tag='div', text=''):
    """Build an element (``HtmlElement``) by parsing a tag name and a string.

    For tags in ``self_closing_tags`` the markup is ``<tag text />``, so
    ``text`` lands where attributes go.  For any other tag it is
    ``<tag>text</tag>``.  ``text`` is interpolated as is, without escaping.

    >>> el = make_tag('p', 'aaa')
    >>> tostring(el)
    '<p>aaa</p>'
    """
    if tag not in self_closing_tags:
        markup = '<%s>%s</%s>' % (tag, text, tag)
    else:
        markup = '<%s %s />' % (tag, text)
    return fromstring(markup)


def wrap_tag(el, tag='div'):
    """Wrap element in a tag.

    A deep copy of ``el`` is placed inside a new ``tag`` element, and that
    wrapper replaces ``el`` in its parent.  Text following ``el`` (its tail)
    stays outside, after the wrapper.

    :param el: element to wrap; it must have a parent
    :param tag: tag name of the wrapper element

    >>> el = fromstring('<p>aaa</p>')
    >>> parent = el.getparent()
    >>> wrap_tag(el, 'div')
    >>> tostring(parent[0])
    '<div><p>aaa</p></div>'
    """
    parent = el.getparent()
    assert parent is not None

    wrapper = make_tag(tag, '')
    inner = deepcopy(el)
    # ``deepcopy`` also copies the tail text.  Left on the copy, the text
    # that followed ``el`` would end up inside the wrapper, so move it onto
    # the wrapper itself.
    wrapper.tail = inner.tail
    inner.tail = None
    wrapper.append(inner)
    replace_tag(el, wrapper)


def remove_tag(el):
    """Drop an element, together with its subelements, from its parent.

    This is a thin wrapper around ``el.drop_tree()``.

    >>> doc = fromstring('<div><p>aaa</p><p>bbb</p></div>')
    >>> el = doc.xpath('//p')[1]
    >>> remove_tag(el)
    >>> tostring(doc)
    '<div><p>aaa</p></div>'
    """
    el.drop_tree()


def replace_tag(el, replace):
    """Put another element (``replace``) in the place of ``el``.

    ``el`` must have a parent; the swap is done by the parent's
    ``replace`` method.

    >>> doc = fromstring('<div><p>aaa</p></div>')
    >>> el = doc.xpath('//p')[0]
    >>> repl = make_tag('h3', 'bbb')
    >>> replace_tag(el, repl)
    >>> tostring(doc)
    '<div><h3>bbb</h3></div>'
    """
    container = el.getparent()
    assert container is not None
    container.replace(el, replace)


def insert_tag(el, add, before=True):
    """Insert element ('add') as a sibling of element ('el').

    With ``before`` true, 'add' goes at the index of 'el' in its parent;
    otherwise it goes at the next index.  'el' must have a parent.

    See `add_hr` for doctest example.
    """
    container = el.getparent()
    assert container is not None
    position = container.index(el) + (0 if before else 1)
    container.insert(position, add)


def check_parent_tag(el, tag='div', generation=2):
    """Look for ``tag`` among the nearest ancestors of an element.

    Walks up at most ``generation`` levels (converted with ``int``).
    Returns the first ancestor whose tag equals ``tag``, or ``None`` when
    the root is passed or no ancestor matches within the limit.

    >>> doc = fromstring('<table><tr><td>aaa</td></tr></table>')
    >>> el = doc.xpath('//td')[0]
    >>> el = check_parent_tag(el, 'table')
    >>> el.tag
    'table'
    """
    ancestor = el
    remaining = int(generation)
    while remaining > 0:
        ancestor = ancestor.getparent()
        if ancestor is None:
            return None
        if ancestor.tag == tag:
            return ancestor
        remaining -= 1
    return None


# ``HtmlElement.text_content()``, or ``etree.tostring(el, method="text")``
# seems to do the same thing.
def get_element_text(el, path='.'):
    """Return all texts in an element or elements, joined into one string.

    When ``path`` matches exactly one element, the XPath ``string()`` value
    of ``path`` is returned.  Otherwise the texts of the matched elements
    are concatenated in match order (an empty string for no match).

    :param el: main element to search
    :param path: xpath string for the element(s) you want

    >>> el = fromstring('<h2>aaa<div>bbb</div></h2>')
    >>> get_element_text(el, '//h2')
    'aaabbb'
    >>> el = fromstring('<div>no<h2>aaa<div>bbb</div><div>ccc<p>ddd</p></div></h2><h2>xxx</h2></div>')  # noqa: E501
    >>> get_element_text(el, '//h2')
    'aaabbbcccdddxxx'
    """
    matched = el.xpath(path)
    if len(matched) != 1:
        return ''.join(get_element_text(item, '.') for item in matched)
    return el.xpath('string(%s)' % path)


def get_metadata(el):
    """Get basic metadata from ``<meta name=... content=...>``.

    All ``<meta>`` elements of the tree ``el`` belongs to are scanned.

    :returns: dict with keys ``authors`` and ``keywords`` (lists), and
        ``description``, ``generator``, ``created`` and ``modified``
        (string, or ``None`` when not found)
    """
    # NOTE(review): the original 'Borrowing codes from:' comment has lost
    # its URL; restore the attribution link.
    root = el.getroottree()

    metadata = dict(authors=[], description=None, generator=None,
                    keywords=[], created=None, modified=None)
    # meta names for which only the first occurrence is kept,
    # mapped to their key in the result
    first_only = {
        'description': 'description',
        'generator': 'generator',
        'dcterms.created': 'created',
        'dcterms.modified': 'modified',
    }

    for element in root.iter('meta'):
        name = element.get('name', '')
        content = element.get('content', '')
        if name == 'author':
            metadata['authors'].append(content)
        elif name == 'keywords':
            # comma separated, whitespace stripped, duplicates dropped
            for keyword in content.split(','):
                keyword = keyword.strip()
                if keyword not in metadata['keywords']:
                    metadata['keywords'].append(keyword)
        elif name in first_only:
            key = first_only[name]
            if metadata[key] is None:
                metadata[key] = content
    return metadata


# ----------------------------------------------------------
# General Functions
def add_h1(doc, force=False):
    """If there is no ``<h1>``, make ``<h1>`` from ``<title>`` tag text.

    The new ``<h1>`` is inserted as the first child of ``<body>``.
    Nothing is done when the document has no title text.

    :param doc: document element (must have ``body``)
    :param force: if not ``False``, add ``<h1>`` even when one exists

    >>> s = '<html><head><title>aaa</title></head><body></body></html>'
    >>> doc = fromstring(s)
    >>> add_h1(doc)
    >>> tostring(doc)
    '<html><head><title>aaa</title></head><body><h1>aaa</h1></body></html>'
    """
    if doc.xpath('//h1') and force is False:
        return
    titles = doc.xpath('//title/text()')
    if not titles:
        return

    # Assign the title as text instead of interpolating it into markup.
    # Otherwise a title containing '<' or '&' is re-parsed as HTML and
    # the heading is corrupted.
    el = make_tag('h1', '')
    el.text = titles[0]
    doc.body.insert(0, el)


def add_h1_force(doc):
    """Add ``<h1>`` from the title, even if there are ``<h1>`` s already."""
    add_h1(doc, force=True)


def delete_duplicate_br(doc, maxnum=2):
    """Continuous ``<br>`` tags to maximum ``<br>``, to save display space.

    A run of ``<br>`` continues while the tags are separated only by
    whitespace.  Surplus tags are removed with ``drop_tag``, so the text
    following them is kept.

    :param doc: element to process (all descendants are visited)
    :param maxnum: number of ``<br>`` to keep in one run
        (converted with ``int``, so a numeric string is accepted)

    >>> el = fromstring('<div>aaa<br><br> <br><br/><br>bbb<br><br></div>')
    >>> delete_duplicate_br(el)
    >>> tostring(el)
    '<div>aaa<br><br> bbb<br><br></div>'
    """
    limit = int(maxnum)
    run = 0
    surplus = []
    for el in doc.iter():
        if el.tag != 'br':
            run = 0
            continue
        run += 1
        if run > limit:
            surplus.append(el)
        if el.tail is None or el.tail.strip() == '':
            # still inside the run; cap the counter so it does not grow
            run = min(run, limit)
        else:
            # real text follows, so the run ends here
            run = 0

    # remove after the traversal, not to disturb ``doc.iter()``
    for el in surplus:
        el.drop_tag()


def youtube_video_to_thumbnail(doc):
    """Change embedded youtube video object to thumbnail image.

    | from: ``<iframe src="https://www.youtube.com/embed/(id)"></iframe>``
    | to: ``<img src="https://img.youtube.com/vi/(id)/hqdefault.jpg">``

    NOTE(review): the xpath substring and the thumbnail url below were
    missing and have been reconstructed (the xpath one from the regular
    expression, the other from the common youtube thumbnail url scheme).
    Confirm them against the intended values.
    """
    # Requiring a non-empty substring also guarantees ``src`` exists,
    # so ``el.get("src")`` is never ``None`` below.
    for el in doc.xpath('//iframe[contains(@src, "www.youtube.com/embed/")]'):
        m = re.match(
            r'(?:https?:)?//www\.youtube\.com/embed/([^/?]+)', el.get("src"))
        if m:
            itagfmt = '<img src="https://img.youtube.com/vi/%s/hqdefault.jpg">'
            itagstr = itagfmt % m.group(1)
            itag = fromstring(itagstr)
            replace_tag(el, itag)


def show_href(doc):
    r"""Make ``<a href=...>`` links to visible text.

    After each ``<a>`` with ``href``, a ``<span>`` of class
    ``tsi-href-visible`` is added, which shows the ``href`` value
    in double square brackets.

    >>> el = fromstring('<div><a href="aaa">bbb</a></div>')
    >>> show_href(el)
    >>> tostring(el)
    '<div><a href="aaa">bbb</a><span class="tsi-href-visible">\xa0 [[aaa]] \xa0</span></div>'
    """  # noqa: E501
    classname = 'tsi-href-visible'
    template = '<span class="%s">&nbsp; [[%s]] &nbsp;</span>'
    for anchor in doc.xpath('//a[@href]'):
        visible = fromstring(template % (classname, anchor.get('href')))
        anchor.addnext(visible)


def lower_heading(doc, path=None):
    """Decrease heading number except specified element (by xpath).

    That is, ``<h1>`` becomes ``<h2>``, ... ``<h5>`` becomes ``<h6>``.
    (``<h6>`` is kept as is).

    It is for prettier Table of Contents,
    TOC is usually copied from heading structure.

    A basic use case is when the document has multiple ``<h1>``.
    You don't want those to clutter TOC tree, want only one of them on top.

    :param path: xpath evaluated on each lowered heading;
        the first heading for which it is true is made ``<h1>``

    >>> el = fromstring('<div><h1>aaa</h1><h1 class="b">bbb</h1><h2>ccc</h2></div>')  # noqa: E501
    >>> lower_heading(el, './@class="b"')
    >>> tostring(el)
    '<div><h2>aaa</h2><h1 class="b">bbb</h1><h3>ccc</h3></div>'
    """
    # from h5 down to h1, so that no heading is lowered twice
    for level in reversed(range(1, 6)):
        lowered = 'h' + str(level + 1)
        for heading in doc.xpath('//h' + str(level)):
            heading.tag = lowered

    if not path:
        return

    # search h2 ... h6 in this order, and promote only the first match
    candidates = (
        heading
        for level in range(2, 7)
        for heading in doc.xpath('//h' + str(level)))
    for heading in candidates:
        if heading.xpath(path):
            heading.tag = 'h1'
            break


def lower_heading_from_order(doc, tag=1, order=1):
    """Decrease heading number except specified element (by order).

    The purpose is the same as `lower_heading`,
    except you specify keep-element by heading number and order.
    So e.g. argument ``'tag=2, order=3'`` means
    third ``<h2>`` tag element in the document.

    :param tag: heading number of the element to keep (``int`` is applied)
    :param order: 1-based position among headings of that number
        (``int`` is applied)

    >>> el = fromstring('<div><h1>aaa</h1><h1>bbb</h1><h2>ccc</h2></div>')
    >>> lower_heading_from_order(el, 1, 2)
    >>> tostring(el)
    '<div><h2>aaa</h2><h1>bbb</h1><h3>ccc</h3></div>'
    """
    # from h5 down to h1, so that no heading is lowered twice
    for level in range(5, 0, -1):
        headings = doc.xpath('//h' + str(level))
        for position, heading in enumerate(headings, start=1):
            if not (level == int(tag) and position == int(order)):
                heading.tag = 'h' + str(level + 1)


def lower_heading_from_order_auto(doc):
    """Lower headings, except first <h1>, if multiple h1 headings found."""
    h1_count = len(doc.xpath('//h1'))
    if h1_count <= 1:
        return
    lower_heading_from_order(doc)


def split_h1(doc, seps=None, part='1'):
    """Remove unwanted parts from h1 string.

    Headings or titles are often composed of multiple items,
    like 'Murder! - Domestic News - The Local Paper'.
    You want just 'Murder!'.

    Selected items are whitespace stripped.
    Only the first ``<h1>`` is processed, with the first separator
    found in its text.  Nothing is done when there is no ``<h1>``,
    or when it has no leading text.

    :param seps: strings by which heading is separated.
        if ``None``, default ``' - ', ' : ', ' | '`` is used.
    :param part: which part to select. '1' means first, or index 0.
        special number '-1' selects last item.
        Both strings and integers are accepted.

    >>> el = fromstring('<h1>aaa ~ bbb</h1>')
    >>> split_h1(el, '~', '2')
    >>> tostring(el)
    '<h1>bbb</h1>'

    >>> el = fromstring('<h1>aaa ~ bbb</h1>')
    >>> split_h1(el, '~', '-1')
    >>> tostring(el)
    '<h1>bbb</h1>'
    """
    headings = doc.xpath('//h1')
    if not headings:
        return
    el = headings[0]
    # ``text`` is None when the heading begins with a child element
    if not el.text:
        return

    if seps is None:
        seps = [' - ', ' : ', ' | ']
    elif isinstance(seps, str):
        seps = [seps]

    for sep in seps:
        if sep in el.text:
            number = int(part)
            # 1-based to 0-based; -1 (string or integer) means the last
            index = -1 if number == -1 else number - 1
            el.text = el.text.split(sep)[index].strip()
            break


def replace_h1(el, pat, repl=''):
    """Change ``<h1>`` string by regular expression, ``pat`` to ``repl``.

    Applied to the leading text of every ``<h1>`` found from ``el``.
    Headings without leading text (e.g. beginning with a child element)
    are left untouched.

    :param el: element to search ``<h1>`` from
    :param pat: regular expression pattern
    :param repl: replacement, as in ``re.sub``

    >>> el = fromstring('<h1>A boring article</h1>')
    >>> replace_h1(el, 'A boring', 'An exciting')
    >>> tostring(el)
    '<h1>An exciting article</h1>'
    """
    for h in el.xpath('//h1'):
        # ``re.sub`` raises TypeError on None
        if h.text is None:
            continue
        h.text = re.sub(pat, repl, h.text)


def code_to_pre_code(doc):
    r"""Wrap ``<code>`` with ``<pre>``, when text includes newlines.

    Sample css adds thin border style to ``<pre>``, and not to ``<code>``,
    which is to make multiline code marked out a little,
    and inline code not looking cluttered,
    in small black and white ebooks.

    But some sites use ``<code>`` indefinitely, also for multiline codes.
    In these cases, adding ``<pre>`` rather unconditionally
    is one of the solutions.

    As an arbitrary precaution,
    if parent or grandparent element tag is ``<pre>``,
    adding another ``<pre>`` is skipped.

    >>> el = fromstring('<code>aaabbb</code>')
    >>> parent = el.getparent()
    >>> code_to_pre_code(el)
    >>> tostring(parent[0])
    '<code>aaabbb</code>'

    >>> el = fromstring('<code>aaa\nbbb</code>')
    >>> parent = el.getparent()
    >>> code_to_pre_code(el)
    >>> tostring(parent[0])
    '<pre><code>aaa\nbbb</code></pre>'
    """
    for el in doc.xpath('//code'):
        # A real newline character.  The raw literal r'\n' is the two
        # characters backslash and 'n', which multiline code in a page
        # does not need to contain.
        if '\n' not in el.text_content():
            continue
        if check_parent_tag(el, 'pre') is not None:
            continue
        wrap_tag(el, 'pre')


def add_hr(doc, path):
    """Put ``<hr>`` tag just before each element the xpath (``'path'``) selects.

    >>> el = fromstring('<div><p>aaa</p><p>bbb</p></div>')
    >>> path = '(//p)[2]'
    >>> add_hr(el, path)
    >>> tostring(el)
    '<div><p>aaa</p><hr><p>bbb</p></div>'
    """
    for target in doc.xpath(path):
        insert_tag(target, make_tag('hr', ''))


def add_description(doc):
    """Add description from ``<meta>``.

    The description (see `get_metadata`) is inserted as ``<p>[ ... ]</p>``,
    as the first child of ``<body>``.
    Nothing is added when the document has no (or empty) description.
    """
    description = get_metadata(doc)['description']
    # without this guard, the literal text '[ None ]' would be inserted
    if not description:
        return

    # Assign as text instead of building markup from it, so that
    # '<' or '&' in the description are not re-parsed as HTML.
    desc = make_tag('p', '')
    desc.text = '[ %s ]' % description
    doc.body.insert(0, desc)


def _add_style(el, style):
    """Set inline style strings ('style') on one element (Note: no doc).

    The element also gets the ``KEEP_STYLE`` class.
    An existing ``style`` attribute value is replaced.

    >>> el = fromstring('<p>aaa</p>')
    >>> _add_style(el, 'font-size: larger;')
    >>> tostring(el)
    '<p class="tsi-keep-style" style="font-size: larger;">aaa</p>'
    """
    el.classes.add(KEEP_STYLE)
    el.set('style', style)


def add_style(doc, path, style):
    """Set inline style strings ('style') on every element of xpath ('path').

    >>> el = fromstring('<div><p>aaa</p></div>')
    >>> add_style(el, '//p', 'font-size: larger;')
    >>> tostring(el)
    '<div><p class="tsi-keep-style" style="font-size: larger;">aaa</p></div>'
    """
    for target in doc.xpath(path):
        _add_style(target, style)


def replace_tags(doc, path, tag='div'):
    """Rename elements selected by xpath, keeping their contents.

    Only the tag name of each matched element is changed to ``tag``.

    >>> doc = fromstring('<div><p>aaa</p>bbb</div>')
    >>> replace_tags(doc, '//div', 'h3')
    >>> tostring(doc)
    '<h3><p>aaa</p>bbb</h3>'
    """
    for target in doc.xpath(path):
        target.tag = tag


def add_noscript_image(doc):
    """Move element inside <noscript> to outside.

    Every ``<img>`` with ``src`` found in a ``<noscript>`` is moved
    to just after that ``<noscript>``, keeping document order.

    >>> doc = fromstring('<h3><noscript><div><img src="a.jpg"></div></noscript></h3>')  # noqa: E501
    >>> add_noscript_image(doc)
    >>> tostring(doc)
    '<h3><noscript><div></div></noscript><img src="a.jpg"></h3>'
    """
    for el in doc.xpath('//noscript'):
        # collect first: moving nodes while ``iter()`` is running
        # changes the tree under the iterator
        images = [
            img for img in el.iter(tag='img') if 'src' in img.attrib]
        # ``addnext`` always inserts right after ``el``,
        # so add in reverse to keep the original order
        for img in reversed(images):
            el.addnext(img)


# ----------------------------------------------------------
# Site Specific Functions
def hackernews_indent(doc):
    """Narrow default indent widths, they are too wide for e-readers.

    Each comment row (the parent of ``<td class="ind">``) is replaced
    by a ``<div>`` with left padding, which holds the comment header
    (``comhead``) and the comment bodies.  Rows without a comment header
    are left as they are.  Finally ``<h1>`` gets a 'hn - ' prefix.

    NOTE(review): ``@class=="..."`` is not standard XPath;
    presumably the project's ``lxml_html`` wrapper expands it to
    a class containment test -- confirm.  It is kept as is.
    """
    for d in doc.xpath('//td[@class="ind"]'):
        width = d.xpath('./img/@width')
        width = width[0] if width else '0'
        tr = d.getparent()

        comhead = tr.xpath('.//span[@class="comhead"]')
        if not comhead:
            # not a comment row we know how to rebuild; keep it untouched
            continue
        comhead = comhead[0]

        # changing image width (px) to padding-left (px),
        # reducing number arbitrarily.
        block = make_tag('div')
        style = 'margin-bottom:1em;padding-left:%dpx;' % int(int(width) / 4)
        _add_style(block, style)

        _add_style(comhead, 'font-weight:bold;')

        # removing unnecessary links
        user = comhead.xpath('./a[@class="hnuser"]')
        if user:
            user = user[0]
            user.attrib.pop('href', None)
            user.tag = 'span'

        date = comhead.xpath('./span[@class="age"]')
        if date:
            date = date[0]
            a = date.xpath('./a')
            if a:
                a = a[0]
                date.text = a.text
                date.remove(a)

        togg = comhead.xpath('./a[@class=="togg"]')
        if togg:
            # simply removing 'togg' breaks tag structures,
            # maybe shouldn't touch direct child
            # comhead.remove(togg[0])  # NG
            togg[0].text = ''

        # '/threads?id=username' pattern
        onstory = comhead.xpath('./span[@class="onstory"]')
        if onstory:
            onstory = onstory[0]
            a = onstory.xpath('./a')
            if a:
                a = a[0]
                # either text may be None
                onstory.text = (onstory.text or '') + (a.text or '')
                onstory.remove(a)

        block.append(comhead)
        comment = tr.xpath('.//div[@class="comment"]')
        if comment:
            # sometimes comment is folded
            for c in comment:
                block.append(c)

        body = tr.getparent()
        body.replace(tr, block)

    # Add sitename hint to h1
    replace_h1(doc, r'^(.+?) \| .*', r'hn - \1')


def reddit_indent(doc):
    """Narrow default indent widths, they are too wide for e-readers.

    Comment blocks get a small left margin, author names in their
    taglines are made bold, and ``<h1>`` gets a 'reddit - ' prefix.

    NOTE(review): ``@class=="..."`` is not standard XPath;
    presumably the project's ``lxml_html`` wrapper expands it to
    a class containment test -- confirm.
    """
    author_path = './/p[@class="tagline"]/a[@class=="author"]'
    for comment in doc.xpath('//div[@class=="comment"]'):
        _add_style(comment, 'margin-left:8px;')
        for author in comment.xpath(author_path):
            _add_style(author, 'font-weight:bold;')

    # Add sitename hint to h1
    replace_h1(doc, r'^(.+)$', r'reddit - \1')


def github_self_anchor(doc):
    """Discard self anchors in headings (``<h1>`` to ``<h4>``).

    We stripped referents, and weasyprint warns it.
    """
    anchors = doc.xpath('(//h1|//h2|//h3|//h4)/a')
    for anchor in anchors:
        if 'anchor' not in anchor.classes:
            continue
        # When setting 'href="#", weasyprint warns
        # 'WARNING: No anchor # for internal URI reference at line None'
        anchor.set('href', '')


def github_issues_comment_header(doc):
    """Change comment header blocks from <h3> to <div>.

    <h3> is too big here, clutters TOC.

    Also discard self anchors in date part of headers
    e.g. 'href="#issuecomment-223857939"'.
    We stripped referents, and weasyprint warns it.

    Also delete the repetitive sentence 'This comment...' (display: none).
    """
    for el in doc.xpath('//h3'):
        if 'timeline-comment-header-text' in el.classes:
            el.tag = 'div'
            for element in el.xpath('.//a'):
                if 'timestamp' in element.classes:
                    element.set('href', '')

        # ``el.text`` is None when the heading begins with a child element
        text = el.text or ''
        if 'This comment has been minimized.' in text:
            if 'text-gray' in el.classes:
                el.text = ''