Skip to content

Commit b064737

Browse files
Merge pull request #21 from weblyzard/feature/render-profiles
Feature/render profiles
2 parents b3a205a + af37a2d commit b064737

File tree

9 files changed

+70
-47
lines changed

9 files changed

+70
-47
lines changed

README.md

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -113,24 +113,29 @@ curl -X POST -H "Content-Type: text/html; encoding=UTF8" -d @test.html http://
113113

114114
The following options are available for fine tuning the way inscriptis translates HTML to text.
115115

116-
1. **More rigorous indentation:** call `get_text()` with the parameter `indentation='extended'` to also use indentation for tags such as `<div>` and `<span>` that do not provide indentation in their standard definition. This strategy is the default in `inscript.py` and many other tools such as lynx. If you do not want extended indentation you can use the parameter `indentation='standard'` instead.
116+
1. **More rigorous indentation:** call `inscriptis.get_text()` with the parameter `indentation='extended'` to also use indentation for tags such as `<div>` and `<span>` that do not provide indentation in their standard definition. This strategy is the default in `inscriptis.get_text()` and in many other tools such as lynx; the `inscript.py` command line client defaults to `standard` and enables it with `--indentation extended`. If you do not want extended indentation you can use the parameter `indentation='standard'` instead.
117117

118118
2. **Overwriting the default CSS definition:** inscriptis uses CSS definitions that are maintained in `inscriptis.css.CSS` for rendering HTML tags. You can override these definitions (and therefore change the rendering) as outlined below:
119119

120120
```python
121-
from inscriptis.css import CSS, HtmlElement
122-
from inscriptis.html_properties import Display
121+
from lxml.html import fromstring
123122

124-
# change the rendering of `div` and `span` elements
125-
CSS['div'] = HtmlElement('div', display=Display.block, padding=2)
126-
CSS['span'] = HtmlElement('span', prefix=' ', suffix=' ')
127-
```
128-
The following code snippet restores the standard behaviour:
129-
```python
130-
from inscriptis.css import CSS, DEFAULT_CSS
123+
from inscriptis.css import DEFAULT_CSS, HtmlElement
124+
from inscriptis.html_properties import Display
131125

132-
# restore standard behaviour
133-
CSS = DEFAULT_CSS.copy()
126+
# create a custom CSS based on the default style sheet and change the rendering of `div` and `span` elements
127+
css = DEFAULT_CSS.copy()
128+
css['div'] = HtmlElement('div', display=Display.block, padding=2)
129+
css['span'] = HtmlElement('span', prefix=' ', suffix=' ')
130+
131+
html_tree = fromstring(html)
132+
# create a parser using the custom css
133+
parser = Inscriptis(html_tree,
134+
display_images=display_images,
135+
deduplicate_captions=deduplicate_captions,
136+
display_links=display_links,
137+
css=css)
138+
text = parser.get_text()
134139
```
135140

136141
## Testing, benchmarking and evaluation

scripts/inscript.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def get_parser():
3434
parser.add_argument('-i', '--display-image-captions', action='store_true', default=False, help='Display image captions (default:false).')
3535
parser.add_argument('-l', '--display-link-targets', action='store_true', default=False, help='Display link targets (default:false).')
3636
parser.add_argument('-d', '--deduplicate-image-captions', action='store_true', default=False, help='Deduplicate image captions (default:false).')
37+
parser.add_argument('--indentation', default='standard', help='How to handle indentation (extended or standard; default: standard).')
3738
return parser
3839

3940

@@ -56,7 +57,8 @@ def get_parser():
5657
text = get_text(html_content,
5758
display_images=args.display_image_captions,
5859
deduplicate_captions=args.deduplicate_image_captions,
59-
display_links=args.display_link_targets)
60+
display_links=args.display_link_targets,
61+
indentation=args.indentation)
6062
if args.output:
6163
with open(args.output, 'w') as open_file:
6264
open_file.write(text)

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@
1313
setup(
1414
# Metadata
1515
name="inscriptis",
16-
version="0.0.4.0",
16+
version="0.0.4.1",
1717
description='inscriptis - HTML to text converter.',
1818
long_description=long_description,
1919
long_description_content_type='text/markdown',
2020
author='Albert Weichselbraun, Fabian Odoni',
21-
author_email='albert.weichselbraun@htwchur.ch, fabian.odoni@htwchur.ch',
21+
author_email='albert.weichselbraun@fhgr.ch, fabian.odoni@fhgr.ch',
2222
classifiers=[
2323
'Topic :: Text Processing :: Markup :: HTML',
2424
'Programming Language :: Python :: 3',

src/inscriptis/__init__.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,49 @@
11

2-
from re import compile
2+
import re
33
from lxml.html import fromstring
44

5+
from inscriptis.css import DEFAULT_CSS, HtmlElement
56
from inscriptis.html_engine import Inscriptis
7+
from inscriptis.html_properties import Display
68

79
__author__ = "Albert Weichselbraun, Fabian Odoni"
8-
__copyright__ = "Copyright (C) 2016 Albert Weichselbraun, Fabian Odoni"
10+
__copyright__ = "Copyright (C) 2016-2019 Albert Weichselbraun, Fabian Odoni"
911
__license__ = "GPL"
1012
__version__ = "0.0.1"
1113
__maintainer__ = "Fabian Odoni"
1214
__email__ = "[email protected]"
1315
__status__ = "Prototype"
1416

15-
RE_STRIP_XML_DECLARATION = compile(r'^<\?xml [^>]+?\?>')
17+
RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>')
1618

17-
18-
def get_text(html_content, display_images=False, deduplicate_captions=False, display_links=False):
19+
def get_text(html_content, display_images=False, deduplicate_captions=False,
20+
display_links=False, indentation='extended'):
1921
'''
20-
::param: html_content
21-
::returns:
22-
a text representation of the html content.
22+
:param html_content: the html string to be converted to text
23+
:param display_images: whether to display image captions
24+
:param indentation: either 'standard' (solely based on the css) or 'extended'
25+
which indents divs and adds spaces between span tags
2326
'''
2427
html_content = html_content.strip()
2528
if not html_content:
26-
return ""
29+
return ''
30+
31+
if indentation == 'extended':
32+
css = DEFAULT_CSS.copy()
33+
css['div'] = HtmlElement('div', display=Display.block, padding=2)
34+
css['span'] = HtmlElement('span', prefix=' ', suffix=' ')
35+
else:
36+
css = DEFAULT_CSS
37+
2738

2839
# strip XML declaration, if necessary
2940
if html_content.startswith('<?xml '):
3041
html_content = RE_STRIP_XML_DECLARATION.sub('', html_content, count=1)
3142

3243
html_tree = fromstring(html_content)
33-
parser = Inscriptis(html_tree, display_images=display_images, deduplicate_captions=deduplicate_captions, display_links=display_links)
44+
parser = Inscriptis(html_tree,
45+
display_images=display_images,
46+
deduplicate_captions=deduplicate_captions,
47+
display_links=display_links,
48+
css=css)
3449
return parser.get_text()

src/inscriptis/css.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,14 @@ def __str__(self):
3838
return "<{tag} prefix={prefix}, suffix={suffix}, display={display}, " \
3939
"margin_before={margin_before}, margin_after={margin_after}, " \
4040
"padding={padding}, whitespace={whitespace}>".format(
41-
tag=self.tag,
42-
prefix=self.prefix,
43-
suffix=self.suffix,
44-
display=self.display,
45-
margin_before=self.margin_before,
46-
margin_after=self.margin_after,
47-
padding=self.padding,
48-
whitespace=self.whitespace)
49-
41+
tag=self.tag,
42+
prefix=self.prefix,
43+
suffix=self.suffix,
44+
display=self.display,
45+
margin_before=self.margin_before,
46+
margin_after=self.margin_after,
47+
padding=self.padding,
48+
whitespace=self.whitespace)
5049

5150
class CssParse(object):
5251
'''
@@ -103,8 +102,7 @@ def _get_em(length):
103102

104103
if unit not in ('em', 'qem', 'rem'):
105104
return int(round(value/8))
106-
else:
107-
return int(round(value))
105+
return int(round(value))
108106

109107
# ------------------------------------------------------------------------
110108
# css styles
@@ -140,7 +138,7 @@ def _attr_padding_left(value, html_element):
140138
_attr_padding_start = _attr_padding_left
141139

142140

143-
CSS = {
141+
DEFAULT_CSS = {
144142
'head': HtmlElement('head', display=Display.none),
145143
'link': HtmlElement('link', display=Display.none),
146144
'meta': HtmlElement('meta', display=Display.none),

src/inscriptis/html_engine.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
'''
1111
from itertools import chain
1212

13-
from inscriptis.css import CSS, CssParse, HtmlElement
13+
from inscriptis.css import DEFAULT_CSS, CssParse, HtmlElement
1414
from inscriptis.html_properties import Display, WhiteSpace, Line
1515
from inscriptis.table_engine import Table
1616

@@ -29,7 +29,7 @@ class Inscriptis(object):
2929

3030
DEFAULT_ELEMENT = HtmlElement()
3131

32-
def __init__(self, html_tree, display_images=False, deduplicate_captions=False, display_links=False):
32+
def __init__(self, html_tree, display_images=False, deduplicate_captions=False, display_links=False, css=None):
3333
'''
3434
::param: display_images \
3535
whether to include image titles/alt texts
@@ -42,6 +42,7 @@ def __init__(self, html_tree, display_images=False, deduplicate_captions=False,
4242
'''
4343
# setup config
4444
self.cfg_deduplicate_captions = deduplicate_captions
45+
self.css = css if css else DEFAULT_CSS
4546

4647
# setup start and end tag call tables
4748
self.start_tag_handler_dict = {
@@ -140,7 +141,7 @@ def write_line_verbatim(self, text):
140141
def handle_starttag(self, tag, attrs):
141142
# use the css to handle tags known to it :)
142143

143-
cur = CSS.get(tag, Inscriptis.DEFAULT_ELEMENT)
144+
cur = self.css.get(tag, Inscriptis.DEFAULT_ELEMENT)
144145
if 'style' in attrs:
145146
cur = CssParse.get_style_attribute(
146147
attrs['style'], html_element=cur)
@@ -192,6 +193,9 @@ def handle_data(self, data):
192193
if self.current_tag[-1].whitespace == WhiteSpace.pre:
193194
data = '\0' + data + '\0'
194195

196+
# add prefix and suffix, if present
197+
data = self.current_tag[-1].prefix + data + self.current_tag[-1].suffix
198+
195199
# determine whether to add this content to a table column
196200
# or to a standard line
197201
self.current_line[-1].content += data

src/inscriptis/html_properties.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ def extract_pre_text(self):
3030
pass
3131

3232
def get_text(self):
33-
# print(">>" + self.content + "<< before: " + str(self.margin_before) + ", after: " + str(self.margin_after) + ", padding: ", self.padding, ", list: ", self.list_bullet)
3433
return ''.join(('\n' * self.margin_before,
3534
' ' * (self.padding - len(self.list_bullet)),
3635
self.list_bullet,

tests/test_html_snippets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_html_snippets(filter_str=''):
2828
print(f.name)
2929
html = u"<html><body>{}</body></html>".format(f.read())
3030

31-
converted_txt = get_text(html).rstrip()
31+
converted_txt = get_text(html, indentation='standard').rstrip()
3232

3333
if converted_txt != reference_txt:
3434
print (u"File:{}\nHTML:\n{}\n\nReference:\n{}\n\nConverted:\n{}".format(testcase_txt, html, reference_txt, converted_txt))

tests/test_list_div.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,18 @@
1010

1111
def test_divs():
1212
html = u'<body>Thomas<div>Anton</div>Maria</body>'
13-
assert get_text(html) == u'Thomas\nAnton\nMaria'
13+
assert get_text(html, indentation='standard') == u'Thomas\nAnton\nMaria'
1414

1515
html = u'<body>Thomas<div>Anna <b>läuft</b> weit weg.</div>'
16-
assert get_text(html) == u'Thomas\nAnna läuft weit weg.'
16+
assert get_text(html, indentation='standard') == u'Thomas\nAnna läuft weit weg.'
1717

1818
html = u'<body>Thomas <ul><li><div>Anton</div>Maria</ul></body>'
19-
assert get_text(html) == u'Thomas\n * Anton\n Maria'
19+
assert get_text(html, indentation='standard') == u'Thomas\n * Anton\n Maria'
2020

2121
html = u'<body>Thomas <ul><li> <div>Anton</div>Maria</ul></body>'
22-
assert get_text(html) == u'Thomas\n * Anton\n Maria'
22+
assert get_text(html, indentation='standard') == u'Thomas\n * Anton\n Maria'
2323

2424
html = u'<body>Thomas <ul><li> a <div>Anton</div>Maria</ul></body>'
25-
assert get_text(html) == u'Thomas\n * a\n Anton\n Maria'
25+
assert get_text(html, indentation='standard') == u'Thomas\n * a\n Anton\n Maria'
2626

2727

0 commit comments

Comments
 (0)