Skip to content

Commit 592f17a

Browse files
committed
Refactoring
1 parent 56c1ba7 commit 592f17a

File tree

10 files changed

+105
-166
lines changed

10 files changed

+105
-166
lines changed

app.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from flask import Flask, jsonify, request, render_template, send_from_directory, send_file
22
from waitress import serve
3-
from lib import CollectionRegistry, TerminologyRegistry, MappingRegistry, ApiError, NotFound, ValidationError, TripleStore
3+
from lib import CollectionRegistry, TerminologyRegistry, MappingRegistry, \
4+
ApiError, NotFound, ValidationError, TripleStore
45
import argparse
56
import os
67
from pathlib import Path
@@ -18,7 +19,6 @@ def init(**config):
1819
global collections
1920
global terminologies
2021
global mappings
21-
global sparql
2222

2323
title = config.get('title', os.getenv('TITLE', 'N4O Graph Importer'))
2424

@@ -67,9 +67,9 @@ def status():
6767
values = {key: str(val) for key, val in app.config.items() if key.islower()}
6868
try:
6969
sparql = TripleStore(app.config['sparql'])
70-
sparql.insert(app.config['base']+'collection/', '')
70+
sparql.insert(f"{app.config['base']}collection/", '')
7171
values['connected'] = True
72-
except Exception as e:
72+
except Exception:
7373
values['connected'] = False
7474
return values
7575

extract-rdf.py

Lines changed: 0 additions & 38 deletions
This file was deleted.

lib/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
from .walk import walk, zipwalk
2-
from .extract import extractRDF
31
from .collections import CollectionRegistry
42
from .terminologies import TerminologyRegistry
53
from .mappings import MappingRegistry
64
from .errors import ApiError, NotFound, NotAllowed, ValidationError, ServerError, ClientError
75
from .utils import read_json, write_json
8-
from .rdf import TripleStore
6+
from .rdf import TripleStore, triple_iterator
97

108

11-
__all__ = [walk, zipwalk, extractRDF, CollectionRegistry, TerminologyRegistry,
9+
__all__ = [CollectionRegistry, TerminologyRegistry, triple_iterator,
1210
MappingRegistry, ApiError, NotFound, NotAllowed, read_json,
1311
write_json, ValidationError, ServerError, ClientError, TripleStore]

lib/extract.py

Lines changed: 0 additions & 38 deletions
This file was deleted.

lib/mappings.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class MappingRegistry(Registry):
2525
def __init__(self, **config):
2626
super().__init__("mappings", **config)
2727

28-
def process_received(self, id, original, fmt, log):
28+
def preprocess_source(self, id, original, fmt, log):
2929
if fmt == "ndjson":
3030
log.append("Converting JSKOS mappings to RDF mapping triples")
3131
source = open(original)
@@ -41,10 +41,10 @@ def process_received(self, id, original, fmt, log):
4141
if type(f) is not list or type(t) is not list or len(f) != 1 or len(t) != 1:
4242
continue
4343
target.write(f"<{f[0]['uri']}> <{prop}> <{t[0]['uri']}> .\n")
44-
except Exception as e:
44+
except Exception:
4545
raise ValidationError("Failed to convert JSKOS mappings!")
4646
# TODO: log number of triples
4747

4848
return original
4949

50-
# TODO: def process_received_rdf to further filter triples from graph
50+
# TODO: def preprocess_source_rdf to further filter triples from graph

lib/rdf.py

Lines changed: 36 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
from rdflib import Graph, URIRef, Literal, BNode
1+
from rdflib import URIRef, Literal, BNode
22
from SPARQLWrapper import SPARQLWrapper
33
import requests
44
from pyld import jsonld
55
from .errors import ServerError
6+
from .walk import walk
7+
import lightrdf
68

79

810
def jsonld2nt(doc, context):
@@ -84,43 +86,38 @@ def sparql_to_rdf(binding):
8486
return Literal(binding['value'])
8587

8688

87-
def rdf_receive(source, path, log, namespaces={}, properties=[]):
88-
namespaces = tuple(list(namespaces.values()))
89-
90-
graph = Graph()
91-
graph.parse(source)
92-
size = len(graph)
93-
log.append(f"Parsed {size} unique triples")
94-
95-
checked = open(path / "checked.nt", "w")
96-
removed = open(path / "removed.nt", "w")
89+
rdfparser = lightrdf.Parser()
90+
91+
92+
def triple_iterator(source, log):
93+
"""Recursively extract RDF triples from a file, directory and/or ZIP archive."""
94+
for name, path, archive in walk(source):
95+
format = None
96+
if name.endswith(".ttl"):
97+
format = "turtle"
98+
elif name.endswith(".nt"):
99+
format = "nt"
100+
elif name.endswith(".owl"):
101+
format = "owl"
102+
elif name.endswith(".rdf"):
103+
format = "xml"
104+
elif name.endswith(".xml"):
105+
# TODO: check whether it's RDF/XML?
106+
format = "xml"
107+
else:
108+
continue
97109

98-
count = 0
99-
for s, p, o in graph.triples((None, None, None)):
100-
if str(s).startswith(namespaces):
101-
removed.write(f"{s.n3()} {p.n3()} {o.n3()} .\n")
110+
if archive:
111+
file = archive.open(name)
112+
base = f"file://{file.name}"
102113
else:
103-
count = count + 1
104-
# TODO: filter out namespaces
105-
# if predicate.startswith(rdflib.RDFS)
106-
checked.write(f"{s.n3()} {p.n3()} {o.n3()} .\n")
107-
108-
log.append(
109-
f"Removed {size - count} triples, remaining {count} unique triples.")
110-
111-
112-
"""
113-
checked=$stage/checked.nt
114-
removed=$stage/removed.nt
115-
namespace=$(jq -r --arg uri "$uri" '.[$uri]' $STAGE/terminology/namespaces.json)
116-
117-
export RDFFILTER_ALLOW_NAMESPACE=$namespace
118-
npm run --silent -- rdffilter $unique -o $checked --stats -f ./js/rdffilter.js
119-
# TODO: extend rdffilter for one-pass
120-
npm run --silent -- rdffilter $unique -o $removed -r -f ./js/rdffilter.js
121-
unset RDFFILTER_ALLOW_NAMESPACE
122-
123-
wc -l $duplicated
124-
wc -l $removed
125-
wc -l $checked
126-
"""
114+
file = f"{'/'.join(path)}/{name}"
115+
base = f"file://{file}"
116+
117+
try:
118+
log.append(f"Extracting RDF from {file} as {format}")
119+
for triple in rdfparser.parse(file, format=format):
120+
yield triple
121+
except Exception as e:
122+
log.append(f"Error parsing {base}: {e}")
123+
continue

lib/registry.py

Lines changed: 57 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from shutil import copy, copyfileobj, rmtree
44
import urllib
55
from jsonschema import validate
6-
from .rdf import rdf_receive, jsonld2nt, TripleStore
6+
from .rdf import jsonld2nt, TripleStore, triple_iterator
77
from .log import Log
88
from .errors import NotFound, ClientError, ValidationError
99
from .utils import read_json, write_json, access_location
@@ -105,7 +105,7 @@ def delete(self, id):
105105
rmtree(self.stage / str(id), ignore_errors=True)
106106

107107
def purge(self):
108-
for id in [t["uri"].split("/")[-1] for t in self.list()]:
108+
for id in [t["id"] for t in self.list()]:
109109
self.delete(id)
110110

111111
def load(self, id):
@@ -139,40 +139,13 @@ def forbidden_namespaces(self, id):
139139
return {}
140140

141141
def receive(self, id, file=None):
142-
file, fmt = self.get_source(id, file)
143-
original, log = self.receive_source(id, file, fmt)
144-
stage = self.stage / str(id)
145-
file = self.process_received(id, original, fmt, log)
146-
namespaces = self.forbidden_namespaces(id)
147-
rdf_receive(file, stage, log, namespaces)
148-
142+
file, fmt = self.identify_source(id, file)
143+
original, log = self.fetch_source(id, file, fmt)
144+
file = self.preprocess_source(id, original, fmt, log)
145+
self.receive_rdf(id, file, log)
149146
return log.done()
150147

151-
def receive_source(self, id, source, fmt):
152-
stage = self.stage / str(id)
153-
stage.mkdir(exist_ok=True)
154-
155-
original = stage / f"original.{fmt}"
156-
log = Log(stage / "receive.json", f"Receiving {id} from {source}")
157-
158-
try:
159-
if "/" not in source:
160-
source = self.data / source
161-
log.append(f"Retrieving source {source} from data directory")
162-
copy(source, original)
163-
else:
164-
# TODO: source may be a DOI or similar identifier
165-
# ./extract-rdf.py $download_dir $stage/triples.nt
166-
log.append(f"Retrieving source from {source}")
167-
with urllib.request.urlopen(source) as fsrc, open(original, 'wb') as fdst:
168-
copyfileobj(fsrc, fdst)
169-
except Exception as e:
170-
log.done(f"Retrieving failed: {e}")
171-
raise NotFound(f"{source} not found")
172-
173-
return (original, log)
174-
175-
def get_source(self, id, source=None):
148+
def identify_source(self, id, source=None):
176149
item = self.get(id)
177150
fmt = None
178151

@@ -188,7 +161,7 @@ def get_source(self, id, source=None):
188161
elif fmt == "rdf/xml":
189162
fmt = "xml"
190163

191-
# TODO: configure and extend this
164+
# TODO: configure and extend this. Add support for .zip files
192165
if not fmt:
193166
if Path(source).suffix in [".nt", ".ttl"]:
194167
fmt = "ttl"
@@ -202,5 +175,53 @@ def get_source(self, id, source=None):
202175

203176
return source, fmt
204177

205-
def process_received(self, id, file, fmt, log):
178+
def fetch_source(self, id, source, fmt):
179+
stage = self.stage / str(id)
180+
stage.mkdir(exist_ok=True)
181+
182+
original = stage / f"original.{fmt}"
183+
log = Log(stage / "receive.json", f"Receiving {id} from {source}")
184+
185+
try:
186+
if "/" not in source:
187+
source = self.data / source
188+
log.append(f"Retrieving source {source} from data directory")
189+
copy(source, original)
190+
else:
191+
# TODO: source may be a DOI or similar identifier
192+
# ./extract-rdf.py $download_dir $stage/triples.nt
193+
log.append(f"Retrieving source from {source}")
194+
with urllib.request.urlopen(source) as fsrc, open(original, 'wb') as fdst:
195+
copyfileobj(fsrc, fdst)
196+
except Exception as e:
197+
log.done(f"Retrieving failed: {e}")
198+
raise NotFound(f"{source} not found")
199+
200+
return (original, log)
201+
202+
def preprocess_source(self, id, file, fmt, log):
206203
return file
204+
205+
def receive_rdf(self, id, source, log):
206+
namespaces = tuple(list(self.forbidden_namespaces(id).values()))
207+
208+
stage = self.stage / str(id)
209+
checked = open(stage / "checked.nt", "w")
210+
removed = open(stage / "removed.nt", "w")
211+
212+
okCount, removedCount = 0, 0
213+
for s, p, o in triple_iterator(source, log):
214+
# TODO: implement more filtering and rewrite
215+
if str(s)[1:].startswith(namespaces):
216+
removedCount = removedCount + 1
217+
removed.write(f"{s} {p} {o} .\n")
218+
else:
219+
okCount = okCount + 1
220+
# TODO: filter out namespaces
221+
# if predicate.startswith(rdflib.RDFS)
222+
checked.write(f"{s} {p} {o} .\n")
223+
224+
log.append(
225+
f"Removed {removedCount} triples, remaining {okCount} unique triples.")
226+
227+
# TODO: if okCount is zero, raise an error

lib/terminologies.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def register(self, item):
4040

4141
return super().register(voc[0], id)
4242

43-
def process_received(self, id, original, fmt, log):
43+
def preprocess_source(self, id, original, fmt, log):
4444
if fmt == "ndjson":
4545
log.append("Converting JSKOS to RDF")
4646
with open(original) as file:

lib/walk.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import os
22
from zipfile import ZipFile, ZipExtFile
3+
from pathlib import Path
34

45

56
def isZip(file) -> bool:
67
"""Check whether given file looks like a ZIP archive."""
7-
return file.endswith(".zip") or file.endswith(".ZIP")
8+
return Path(file).suffix == ".zip" or Path(file).suffix == ".ZIP"
89

910

1011
def zipwalk(file, path=None) -> list:

tests/test_api.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -336,5 +336,3 @@ def test_mappings(client):
336336
# TODO: check contents
337337

338338
assert client.get("/mappings/2/stage/").status_code == 200
339-
340-

0 commit comments

Comments
 (0)