Merge pull request #21 from weblyzard/feature/render-profiles

AlbertWeichselbraun · web-flow · commit b0647370b8ac · 2019-09-25T15:02:27.000+02:00
Feature/render profiles
diff --git a/README.md b/README.md
@@ -113,24 +113,29 @@ curl -X POST  -H "Content-Type: text/html; encoding=UTF8" -d @test.html  http://
 
 The following options are available for fine tuning the way inscriptis translates HTML to text.
 
-1. **More rigorous indentation:** call `get_text()` with the parameter `indentation='extended'` to also use indentation for tags such as `<div>` and `<span>` that do not provide indentation in their standard definition. This strategy is the default in `inscript.py` and many other tools such as lynx. If you do not want extended indentation you can use the parameter `indentation='standard'` instead.
+1. **More rigorous indentation:** call `inscriptis.get_text()` with the parameter `indentation='extended'` to also use indentation for tags such as `<div>` and `<span>` that do not provide indentation in their standard definition. This strategy is the default of `inscriptis.get_text()` and of many other tools such as lynx. If you do not want extended indentation you can use the parameter `indentation='standard'` instead (this is the default of the `inscript.py` script, which accepts `--indentation extended` to switch).
 
 2. **Overwriting the default CSS definition:** inscriptis uses CSS definitions that are maintained in `inscriptis.css.CSS` for rendering HTML tags. You can override these definitions (and therefore change the rendering) as outlined below:
 
    ```python
-   from inscriptis.css import CSS, HtmlElement
-   from inscriptis.html_properties import Display
+   from lxml.html import fromstring
 
-   # change the rendering of `div` and `span` elements
-   CSS['div'] = HtmlElement('div', display=Display.block, padding=2)
-   CSS['span'] = HtmlElement('span', prefix=' ', suffix=' ')
-   ```
-   The following code snippet restores the standard behaviour:
-   ```python
-   from inscriptis.css import CSS, DEFAULT_CSS
+   from inscriptis.css import DEFAULT_CSS, HtmlElement
+   from inscriptis.html_properties import Display
 
-   # restore standard behaviour
-   CSS = DEFAULT_CSS.copy()
+   # create a custom CSS based on the default style sheet and change the rendering of `div` and `span` elements
+   css = DEFAULT_CSS.copy()
+   css['div'] = HtmlElement('div', display=Display.block, padding=2)
+   css['span'] = HtmlElement('span', prefix=' ', suffix=' ')
+
+   html_tree = fromstring(html)
+   # create a parser using the custom css
+   parser = Inscriptis(html_tree,
+                       display_images=display_images,
+                       deduplicate_captions=deduplicate_captions,
+                       display_links=display_links,
+                       css=css)
+   text = parser.get_text()
    ```
 
 ## Testing, benchmarking and evaluation
diff --git a/scripts/inscript.py b/scripts/inscript.py
@@ -34,6 +34,7 @@ def get_parser():
     parser.add_argument('-i', '--display-image-captions', action='store_true', default=False, help='Display image captions (default:false).')
     parser.add_argument('-l', '--display-link-targets', action='store_true', default=False, help='Display link targets (default:false).')
     parser.add_argument('-d', '--deduplicate-image-captions', action='store_true', default=False, help='Deduplicate image captions (default:false).')
+    parser.add_argument('--indentation', default='standard', help='How to handle indentation (extended or standard; default: standard).')
     return parser
 
 
@@ -56,7 +57,8 @@ def get_parser():
     text = get_text(html_content,
                     display_images=args.display_image_captions,
                     deduplicate_captions=args.deduplicate_image_captions,
-                    display_links=args.display_link_targets)
+                    display_links=args.display_link_targets,
+                    indentation=args.indentation)
     if args.output:
         with open(args.output, 'w') as open_file:
             open_file.write(text)
diff --git a/setup.py b/setup.py
@@ -13,12 +13,12 @@
 setup(
     # Metadata
     name="inscriptis",
-    version="0.0.4.0",
+    version="0.0.4.1",
     description='inscriptis - HTML to text converter.',
     long_description=long_description,
     long_description_content_type='text/markdown',
     author='Albert Weichselbraun, Fabian Odoni',
-    author_email='albert.weichselbraun@htwchur.ch, fabian.odoni@htwchur.ch',
+    author_email='albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch',
     classifiers=[
            'Topic :: Text Processing :: Markup :: HTML',
            'Programming Language :: Python :: 3',
diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py
@@ -1,34 +1,49 @@
 
-from re import compile
+import re
 from lxml.html import fromstring
 
+from inscriptis.css import DEFAULT_CSS, HtmlElement
 from inscriptis.html_engine import Inscriptis
+from inscriptis.html_properties import Display
 
 __author__ = "Albert Weichselbraun, Fabian Odoni"
-__copyright__ = "Copyright (C) 2016 Albert Weichselbraun, Fabian Odoni"
+__copyright__ = "Copyright (C) 2016-2019 Albert Weichselbraun, Fabian Odoni"
 __license__ = "GPL"
 __version__ = "0.0.1"
 __maintainer__ = "Fabian Odoni"
 __email__ = "fabian.odoni@htwchur.ch"
 __status__ = "Prototype"
 
-RE_STRIP_XML_DECLARATION = compile(r'^<\?xml [^>]+?\?>')
+RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>')
 
-
-def get_text(html_content, display_images=False, deduplicate_captions=False, display_links=False):
+def get_text(html_content, display_images=False, deduplicate_captions=False,
+             display_links=False, indentation='extended'):
     '''
-    ::param: html_content
-    ::returns:
-        a text representation of the html content.
+    :param html_content: the html string to be converted to text
+    :param display_images: whether to display image captions
+    :param indentation: either 'standard' (solely based on the css) or 'extended'
+        which indents divs and adds spaces between span tags
     '''
     html_content = html_content.strip()
     if not html_content:
-        return ""
+        return ''
+
+    if indentation == 'extended':
+        css = DEFAULT_CSS.copy()
+        css['div'] = HtmlElement('div', display=Display.block, padding=2)
+        css['span'] = HtmlElement('span', prefix=' ', suffix=' ')
+    else:
+        css = DEFAULT_CSS
+
 
     # strip XML declaration, if necessary
     if html_content.startswith('<?xml '):
         html_content = RE_STRIP_XML_DECLARATION.sub('', html_content, count=1)
 
     html_tree = fromstring(html_content)
-    parser = Inscriptis(html_tree, display_images=display_images, deduplicate_captions=deduplicate_captions, display_links=display_links)
+    parser = Inscriptis(html_tree,
+                        display_images=display_images,
+                        deduplicate_captions=deduplicate_captions,
+                        display_links=display_links,
+                        css=css)
     return parser.get_text()
diff --git a/src/inscriptis/css.py b/src/inscriptis/css.py
@@ -38,15 +38,14 @@ def __str__(self):
         return "<{tag} prefix={prefix}, suffix={suffix}, display={display}, " \
                "margin_before={margin_before}, margin_after={margin_after}, " \
                "padding={padding}, whitespace={whitespace}>".format(
-                  tag=self.tag,
-                  prefix=self.prefix,
-                  suffix=self.suffix,
-                  display=self.display,
-                  margin_before=self.margin_before,
-                  margin_after=self.margin_after,
-                  padding=self.padding,
-                  whitespace=self.whitespace)
-
+                   tag=self.tag,
+                   prefix=self.prefix,
+                   suffix=self.suffix,
+                   display=self.display,
+                   margin_before=self.margin_before,
+                   margin_after=self.margin_after,
+                   padding=self.padding,
+                   whitespace=self.whitespace)
 
 class CssParse(object):
     '''
@@ -103,8 +102,7 @@ def _get_em(length):
 
         if unit not in ('em', 'qem', 'rem'):
             return int(round(value/8))
-        else:
-            return int(round(value))
+        return int(round(value))
 
     # ------------------------------------------------------------------------
     # css styles
@@ -140,7 +138,7 @@ def _attr_padding_left(value, html_element):
     _attr_padding_start = _attr_padding_left
 
 
-CSS = {
+DEFAULT_CSS = {
     'head': HtmlElement('head', display=Display.none),
     'link': HtmlElement('link', display=Display.none),
     'meta': HtmlElement('meta', display=Display.none),
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
@@ -10,7 +10,7 @@
 '''
 from itertools import chain
 
-from inscriptis.css import CSS, CssParse, HtmlElement
+from inscriptis.css import DEFAULT_CSS, CssParse, HtmlElement
 from inscriptis.html_properties import Display, WhiteSpace, Line
 from inscriptis.table_engine import Table
 
@@ -29,7 +29,7 @@ class Inscriptis(object):
 
     DEFAULT_ELEMENT = HtmlElement()
 
-    def __init__(self, html_tree, display_images=False, deduplicate_captions=False, display_links=False):
+    def __init__(self, html_tree, display_images=False, deduplicate_captions=False, display_links=False, css=None):
         '''
         ::param: display_images \
             whether to include image tiles/alt texts
@@ -42,6 +42,7 @@ def __init__(self, html_tree, display_images=False, deduplicate_captions=False,
         '''
         # setup config
         self.cfg_deduplicate_captions = deduplicate_captions
+        self.css = css if css else DEFAULT_CSS
 
         # setup start and end tag call tables
         self.start_tag_handler_dict = {
@@ -140,7 +141,7 @@ def write_line_verbatim(self, text):
     def handle_starttag(self, tag, attrs):
         # use the css to handle tags known to it :)
 
-        cur = CSS.get(tag, Inscriptis.DEFAULT_ELEMENT)
+        cur = self.css.get(tag, Inscriptis.DEFAULT_ELEMENT)
         if 'style' in attrs:
             cur = CssParse.get_style_attribute(
                 attrs['style'], html_element=cur)
@@ -192,6 +193,9 @@ def handle_data(self, data):
         if self.current_tag[-1].whitespace == WhiteSpace.pre:
             data = '\0' + data + '\0'
 
+        # add the current tag's prefix and suffix, if present
+        data = self.current_tag[-1].prefix + data + self.current_tag[-1].suffix
+
         # determine whether to add this content to a table column
         # or to a standard line
         self.current_line[-1].content += data
diff --git a/src/inscriptis/html_properties.py b/src/inscriptis/html_properties.py
@@ -30,7 +30,6 @@ def extract_pre_text(self):
         pass
 
     def get_text(self):
-        # print(">>" + self.content + "<< before: " + str(self.margin_before) + ", after: " + str(self.margin_after) + ", padding: ", self.padding, ", list: ", self.list_bullet)
         return ''.join(('\n' * self.margin_before,
                         ' ' * (self.padding - len(self.list_bullet)),
                         self.list_bullet,
diff --git a/tests/test_html_snippets.py b/tests/test_html_snippets.py
@@ -28,7 +28,7 @@ def test_html_snippets(filter_str=''):
             print(f.name)
             html = u"<html><body>{}</body></html>".format(f.read())
 
-        converted_txt = get_text(html).rstrip()
+        converted_txt = get_text(html, indentation='standard').rstrip()
 
         if converted_txt != reference_txt:
             print (u"File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}".format(testcase_txt, html, reference_txt, converted_txt))
diff --git a/tests/test_list_div.py b/tests/test_list_div.py
@@ -10,18 +10,18 @@
 
 def test_divs():
     html = u'<body>Thomas<div>Anton</div>Maria</body>'
-    assert get_text(html) == u'Thomas\nAnton\nMaria'
+    assert get_text(html, indentation='standard') == u'Thomas\nAnton\nMaria'
 
     html = u'<body>Thomas<div>Anna <b>läuft</b> weit weg.</div>'
-    assert get_text(html) == u'Thomas\nAnna läuft weit weg.'
+    assert get_text(html, indentation='standard') == u'Thomas\nAnna läuft weit weg.'
 
     html = u'<body>Thomas <ul><li><div>Anton</div>Maria</ul></body>'
-    assert get_text(html) == u'Thomas\n  * Anton\n    Maria'
+    assert get_text(html, indentation='standard') == u'Thomas\n  * Anton\n    Maria'
 
     html = u'<body>Thomas <ul><li>  <div>Anton</div>Maria</ul></body>'
-    assert get_text(html) == u'Thomas\n  * Anton\n    Maria'
+    assert get_text(html, indentation='standard') == u'Thomas\n  * Anton\n    Maria'
 
     html = u'<body>Thomas <ul><li> a  <div>Anton</div>Maria</ul></body>'
-    assert get_text(html) == u'Thomas\n  * a\n    Anton\n    Maria'
+    assert get_text(html, indentation='standard') == u'Thomas\n  * a\n    Anton\n    Maria'