Skip to content

Commit 502ffea

Browse files
Merge pull request #15 from weblyzard/feature/fix_unicode_python2.7
fix: unicode handling in python2.7, chg: pep8
2 parents 2b36a9e + 4d1bc4c commit 502ffea

File tree

2 files changed

+18
-15
lines changed

2 files changed

+18
-15
lines changed

src/inscriptis/html_engine.py

Lines changed: 14 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
try:
1818
# python 2 compatibility
1919
from HTMLParser import HTMLParser
20+
unescape = HTMLParser().unescape
2021
except ImportError:
2122
from html import unescape
2223

@@ -53,7 +54,7 @@ def __init__(self, html_tree, display_images=False, deduplicate_captions=False,
5354
'li': self.start_li,
5455
'br': self.newline,
5556
'a': self.start_a if display_links else None,
56-
'img' :self.start_img if display_images else None,
57+
'img': self.start_img if display_images else None,
5758
}
5859
self.end_tag_handler_dict = {
5960
'table': self.end_table,
@@ -78,7 +79,7 @@ def __init__(self, html_tree, display_images=False, deduplicate_captions=False,
7879
self.current_table = []
7980
self.li_counter = []
8081
self.li_level = 0
81-
self.invisible = [] # a list of attributes that are considered invisible
82+
self.invisible = [] # a list of attributes that are considered invisible
8283
self.last_caption = None
8384

8485
# used if display_links is enabled
@@ -120,8 +121,8 @@ def write_line(self, force=False):
120121
'''
121122
# only break the line if there is any relevant content
122123
if not force and (not self.current_line[-1].content or self.current_line[-1].content.isspace()):
123-
self.current_line[-1].margin_before = max(self.current_line[-1].margin_before, \
124-
self.current_tag[-1].margin_before)
124+
self.current_line[-1].margin_before = max(self.current_line[-1].margin_before,
125+
self.current_tag[-1].margin_before)
125126
return False
126127

127128
line = self.current_line[-1].get_text()
@@ -141,7 +142,8 @@ def handle_starttag(self, tag, attrs):
141142

142143
cur = CSS.get(tag, Inscriptis.DEFAULT_ELEMENT)
143144
if 'style' in attrs:
144-
cur = CssParse.get_style_attribute(attrs['style'], html_element=cur)
145+
cur = CssParse.get_style_attribute(
146+
attrs['style'], html_element=cur)
145147
self.current_tag.append(cur)
146148
if cur.display == Display.none or self.invisible:
147149
self.invisible.append(cur)
@@ -151,10 +153,12 @@ def handle_starttag(self, tag, attrs):
151153
# flush text before display:block elements
152154
if cur.display == Display.block:
153155
if not self.write_line():
154-
self.current_line[-1].margin_before = max(self.current_line[-1].margin_before, cur.margin_before)
156+
self.current_line[-1].margin_before = max(
157+
self.current_line[-1].margin_before, cur.margin_before)
155158
self.current_line[-1].padding = self.next_line[-1].padding
156159
else:
157-
self.current_line[-1].margin_after = max(self.current_line[-1].margin_after, cur.margin_after)
160+
self.current_line[-1].margin_after = max(
161+
self.current_line[-1].margin_after, cur.margin_after)
158162

159163
handler = self.start_tag_handler_dict.get(tag, None)
160164
if handler:
@@ -167,7 +171,8 @@ def handle_endtag(self, tag):
167171
return
168172

169173
self.next_line[-1].padding = self.current_line[-1].padding - cur.padding
170-
self.current_line[-1].margin_after = max(self.current_line[-1].margin_after, cur.margin_after)
174+
self.current_line[-1].margin_after = max(
175+
self.current_line[-1].margin_after, cur.margin_after)
171176
# flush text after display:block elements
172177
if cur.display == Display.block:
173178
# propagate the new padding to the current line, if nothing has
@@ -193,7 +198,7 @@ def handle_data(self, data):
193198

194199
def start_ul(self, attrs):
195200
self.li_level += 1
196-
self.li_counter.append(Inscriptis.get_bullet(self.li_level-1))
201+
self.li_counter.append(Inscriptis.get_bullet(self.li_level - 1))
197202

198203
def end_ul(self):
199204
self.li_level -= 1
@@ -220,7 +225,6 @@ def end_ol(self):
220225
self.li_level -= 1
221226
self.li_counter.pop()
222227

223-
224228
def start_li(self, attrs):
225229
self.write_line()
226230
if self.li_level > 0:
@@ -287,5 +291,3 @@ def get_bullet(index):
287291
the bullet that corresponds to the given index
288292
'''
289293
return Inscriptis.UL_COUNTER[index % Inscriptis.UL_COUNTER_LEN]
290-
291-

src/inscriptis/table_engine.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22
# encoding: utf-8
33

4+
45
class TableCell:
56
''' A single table cell '''
67

@@ -25,7 +26,7 @@ def get_format_spec(self):
2526
'''
2627
The format specification according to the values of `align` and `width`
2728
'''
28-
return "{{:{align}{width}}}".format(align=self.align, width=self.width)
29+
return u"{{:{align}{width}}}".format(align=self.align, width=self.width)
2930

3031
def get_text(self):
3132
text = '\n'.join(self.canvas).strip()
@@ -63,7 +64,8 @@ def compute_column_width(self):
6364

6465
for column_idx in range(max_columns):
6566
# determine max_column_width
66-
max_column_width = max([len(row.get_cell_text(column_idx)) for row in self.rows])
67+
max_column_width = max(
68+
[len(row.get_cell_text(column_idx)) for row in self.rows])
6769

6870
# set column width in all rows
6971
for row in self.rows:
@@ -93,7 +95,6 @@ def get_cell_text(self, column_idx):
9395
'''
9496
return '' if column_idx >= len(self.columns) else self.columns[column_idx].get_text()
9597

96-
9798
def get_text(self):
9899
'''
99100
::returns:

0 commit comments

Comments
 (0)