Skip to content

Commit 307b04a

Browse files
committed
Support receiving data from Zip archives (#35)
1 parent 0edc322 commit 307b04a

File tree

6 files changed

+41
-24
lines changed

6 files changed

+41
-24
lines changed

README.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,12 @@ List and get files of the stage directory of a terminology.
171171

172172
#### POST /terminology/:id/receive
173173

174-
Receive terminology data. The location of the data is going to be extracted from terminology metadata from BARTOC but this has not been implemented yet. For now pass query parameter `from` instead to locate an URL or the name of a file in the data directory. Format must be RDF/Turtle for file extension `.ttl` or `.nt`, otherwise RDF/XML.
174+
Receive terminology data. The location of the data will eventually be extracted from the terminology metadata in BARTOC, but this has not been implemented yet. For now, pass the query parameter `from` instead, giving a URL or the name of a file in the data directory. The file format can be:
175+
176+
- RDF/Turtle for file extension `.ttl` or `.nt`
177+
- RDF/XML for file extension `.rdf` or `.xml`
178+
- JSKOS as newline delimited JSON for file extension `.ndjson`
179+
- A ZIP archive containing RDF files for file extension `.zip`
175180

176181
#### GET /terminology/:id/receive
177182

@@ -241,7 +246,11 @@ List and get files of the stage directory of a collection.
241246

242247
#### POST /collection/:id/receive
243248

244-
Receive and process collection data. The location of the data is taken from collection metadata field `access` if existing. The location can be overridden with optional query parameter `from` with an URL or a file name from local data directory.
249+
Receive and process collection data. The location of the data is taken from the collection metadata field `access`, if it exists. The location can be overridden with the optional query parameter `from`, giving a URL or a file name from the local data directory. The file format can be:
250+
251+
- RDF/Turtle for file extension `.ttl` or `.nt`
252+
- RDF/XML for file extension `.rdf` or `.xml`
253+
- A ZIP archive containing RDF files for file extension `.zip`
245254

246255
#### GET /collection/:id/receive
247256

lib/mappings.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,10 @@ def save_mappings_stage(self, id, name, data):
6060
stage.mkdir(exist_ok=True)
6161
# TODO: guess format from first line
6262
# process_jskos_mappings(self, source, target, log):
63-
#fmt = ".ttl"
64-
#file = f"{name}{fmt}"
65-
#f = open(stage / file, "w")
66-
#f.write(data)
63+
# fmt = ".ttl"
64+
# file = f"{name}{fmt}"
65+
# f = open(stage / file, "w")
66+
# f.write(data)
6767

6868
def append(self, id, data):
6969
self.save_mappings_stage(id, "append")

lib/rdf.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,6 @@ def jsonld2nt(doc, context):
1818
return jsonld.to_rdf(expanded, options={'format': 'application/n-quads'})
1919

2020

21-
# for pretty-printing
22-
# Namespace prefixes for pretty RDF/Turtle
23-
# prefixes = read_json(Path(__file__).parent.parent / 'prefixes.json')
24-
#
25-
# def to_rdf(doc, context):
26-
# nquads = jsonld2nt(doc, context)
27-
# g = Graph(bind_namespaces="none")
28-
# for prefix, uri in prefixes.items():
29-
# g.bind(prefix, Namespace(uri))
30-
# g.parse(data=nquads, format='nquads')
31-
# return g
32-
33-
3421
class TripleStore:
3522
def __init__(self, api):
3623
self.api = api
@@ -86,11 +73,9 @@ def sparql_to_rdf(binding):
8673
return Literal(binding['value'])
8774

8875

89-
rdfparser = lightrdf.Parser()
90-
91-
9276
def triple_iterator(source, log):
9377
"""Recursively extract RDF triples from a file, directory and/or ZIP archive."""
78+
rdfparser = lightrdf.Parser()
9479
for name, path, archive in walk(source):
9580
format = None
9681
if name.endswith(".ttl"):
@@ -115,7 +100,8 @@ def triple_iterator(source, log):
115100
base = f"file://{file}"
116101

117102
try:
118-
log.append(f"Extracting RDF from {file} as {format}")
103+
log.append(f"Extracting RDF from {base} as {format}")
104+
# TODO: pass errors as warnings to logger instead of STDERR
119105
for triple in rdfparser.parse(file, format=format):
120106
yield triple
121107
except Exception as e:

lib/registry.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,8 @@ def identify_source(self, id, source=None):
169169
fmt = "xml"
170170
elif Path(source).suffix in [".ndjson"]:
171171
fmt = "ndjson"
172+
elif Path(source).suffix in [".zip", ".ZIP"]:
173+
fmt = "zip"
172174

173175
if not fmt:
174176
raise ClientError("Unknown data format")
@@ -200,6 +202,7 @@ def fetch_source(self, id, source, fmt):
200202
return (original, log)
201203

202204
def preprocess_source(self, id, file, fmt, log):
205+
"""Returned file must be RDF or ZIP (with RDF)."""
203206
return file
204207

205208
def receive_rdf(self, id, source, log):

tests/rdf.zip

2.24 KB
Binary file not shown.

tests/test_api.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pathlib import Path
77
import pytest
88

9+
from rdflib import Graph
910
from lib import TripleStore, read_json
1011
from app import app, init
1112

@@ -35,6 +36,17 @@
3536
"partOf": [base]
3637
}
3738

39+
# for pretty-printing
40+
# Namespace prefixes for pretty RDF/Turtle
41+
#def to_rdf(doc, context):
42+
# prefixes = read_json(Path(__file__).parent.parent / 'prefixes.json')
43+
# nquads = jsonld2nt(doc, context)
44+
# g = Graph(bind_namespaces="none")
45+
# for prefix, uri in prefixes.items():
46+
# g.bind(prefix, Namespace(uri))
47+
# g.parse(data=nquads, format='nquads')
48+
# return g
49+
3850

3951
def count_graphs():
4052
query = "SELECT ?g (count(*) as ?t) { GRAPH ?g {?s ?p ?o} } GROUP BY ?g"
@@ -306,6 +318,13 @@ def test_api(client):
306318
client.post('/collection/3/load')
307319
assert sparql.query(query) == graph[:1]
308320

321+
assert client.post('/collection/3/receive?from=rdf.zip').status_code == 200
322+
assert client.post('/collection/3/load').status_code == 200
323+
324+
query = f"SELECT ?e {{ GRAPH <{base}3> {{ ?e a ?t }} }} ORDER BY ?o"
325+
res = [r["e"]["value"] for r in sparql.query(query)]
326+
assert res == [f'https://example.org/e{i}' for i in [1, 2, 3, 4]]
327+
309328
assert client.post(
310329
'/terminology/1644/receive?from=crm.ttl').status_code == 200
311330

@@ -348,7 +367,7 @@ def test_mappings(client):
348367
f'Retrieving source {cwd}/tests/mappings.ndjson from data directory',
349368
'Converting JSKOS mappings to RDF mapping triples',
350369
'Processed 2 lines into 1 mappings',
351-
f'Extracting RDF from {stage}/mappings/1/original.ttl as turtle',
370+
f'Extracting RDF from file://{stage}/mappings/1/original.ttl as turtle',
352371
'Removed 0 triples, remaining 1 unique triples.',
353372
'done']
354373

0 commit comments

Comments
 (0)