Skip to content

Commit 4283a02

Browse files
committed
Add xpath text baseline (without output as it's huge)
1 parent 8c1f1bd commit 4283a02

File tree

1 file changed

+30
-0
lines changed

1 file changed

+30
-0
lines changed

run_xpath_text.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env python3
2+
import gzip
3+
import json
4+
from pathlib import Path
5+
6+
import lxml.html
7+
8+
9+
def xpath_text(html: str) -> str:
    """Return the text nodes of an HTML document joined by single spaces.

    The markup is parsed with ``lxml.html.fromstring``. If the parsed tree
    contains at least one ``<body>`` element, text nodes are collected from
    below the first one; otherwise they are collected from below the parsed
    root. The text nodes are joined as-is, without stripping or filtering.
    """
    tree = lxml.html.fromstring(html)
    body_elements = tree.xpath('//body')
    # Fall back to the parsed root when no <body> element is present.
    scope = body_elements[0] if body_elements else tree
    text_nodes = scope.xpath('.//text()')
    return ' '.join(text_nodes)
15+
16+
17+
def main():
    """Extract text from every ``html/*.html.gz`` file into one JSON file.

    Each gzipped file is read as UTF-8 text and passed to ``xpath_text``.
    The result is stored as ``{'articleBody': text}`` under the item id,
    which is the part of the file name before its first dot. The mapping is
    written to ``output/xpath-text.json`` as indented JSON with sorted keys
    and non-ASCII characters left unescaped.
    """
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        output[item_id] = {'articleBody': xpath_text(html)}
    output_dir = Path('output')
    # Create the directory up front: without it, a missing ``output/`` makes
    # write_text raise FileNotFoundError after all the parsing work is done.
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / 'xpath-text.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
27+
28+
29+
# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)