Skip to content

Commit a86a6d9

Browse files
authored
Merge pull request #3970 from kobotoolbox/3967-paired-data-slow-performance
Generate the external XML data of connected parent projects faster
2 parents a9556ec + 221566f commit a86a6d9

File tree

7 files changed

+75
-12
lines changed

7 files changed

+75
-12
lines changed

kobo/apps/reports/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@
99
DEFAULT_REPORTS_KEY = 'default'
1010

1111
FUZZY_VERSION_ID_KEY = '_version_'
12-
INFERRED_VERSION_ID_KEY = '__inferred_version__'
12+
FUZZY_VERSION_PATTERN = r'^__?version__?(\d{3})?$'
13+
INFERRED_VERSION_ID_KEY = '__inferred_version__'

kpi/models/paired_data.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def __init__(
6666
self.paired_data_uid = paired_data_uid
6767

6868
self._asset_file = None
69+
self._source_asset = None
6970

7071
def __str__(self):
7172
return f'<PairedData {self.paired_data_uid} ({self.filename})>'
@@ -156,7 +157,11 @@ def get_download_url(self, request):
156157
args=(self.asset.uid, self.paired_data_uid, 'xml'),
157158
request=request)
158159

159-
def get_source(self) -> Union['Asset', None]:
160+
def get_source(self, force: bool = False) -> Union['Asset', None]:
161+
# If `self._source_asset` has already been set, use the cached
162+
# object instead of fetching it from DB again.
163+
if not force and self._source_asset:
164+
return self._source_asset
160165

161166
# Avoid circular import
162167
Asset = self.asset.__class__ # noqa
@@ -183,7 +188,9 @@ def get_source(self) -> Union['Asset', None]:
183188
):
184189
return None
185190

186-
return source_asset
191+
self._source_asset = source_asset
192+
193+
return self._source_asset
187194

188195
@property
189196
def md5_hash(self):

kpi/serializers/v2/asset.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# coding: utf-8
22
import json
3+
import re
34

45
from django.conf import settings
56
from django.utils.translation import gettext as t
@@ -8,6 +9,7 @@
89
from rest_framework.reverse import reverse
910
from rest_framework.utils.serializer_helpers import ReturnList
1011

12+
from kobo.apps.reports.constants import FUZZY_VERSION_PATTERN
1113
from kobo.apps.reports.report_data import build_formpack
1214
from kpi.constants import (
1315
ASSET_STATUS_DISCOVERABLE,
@@ -498,18 +500,32 @@ def validate_data_sharing(self, data_sharing: dict) -> dict:
498500
else:
499501
asset = self.instance
500502
fields = data_sharing['fields']
501-
form_pack, _unused = build_formpack(asset, submission_stream=[])
503+
# We used to get all fields for every version for valid fields,
504+
# but the UI shows the latest version only, so only its fields
505+
# can be selected. This makes it easier to compare valid fields with
506+
# user's choice.
507+
form_pack, _unused = build_formpack(
508+
asset, submission_stream=[], use_all_form_versions=False
509+
)
510+
# We do not want to include the version field.
511+
# See `_infer_version_id()` in `kobo.apps.reports.report_data.build_formpack`
512+
# for field name alternatives.
502513
valid_fields = [
503514
f.path for f in form_pack.get_fields_for_versions(
504515
form_pack.versions.keys()
505-
)
516+
) if not re.match(FUZZY_VERSION_PATTERN, f.path)
506517
]
507518
unknown_fields = set(fields) - set(valid_fields)
508519
if unknown_fields and valid_fields:
509520
errors['fields'] = t(
510521
'Some fields are invalid, '
511522
'choices are: `{valid_fields}`'
512523
).format(valid_fields='`,`'.join(valid_fields))
524+
525+
# Force `fields` to be an empty list to avoid useless parsing when
526+
# fetching external xml endpoint (i.e.: /api/v2/assets/<asset_uid>/paired-data/<paired_data_uid>/external.xml)
527+
if sorted(valid_fields) == sorted(fields):
528+
data_sharing['fields'] = []
513529
else:
514530
data_sharing['fields'] = []
515531

kpi/serializers/v2/paired_data.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from rest_framework import serializers
77
from rest_framework.reverse import reverse
88

9+
from kobo.apps.reports.constants import FUZZY_VERSION_PATTERN
910
from kobo.apps.reports.report_data import build_formpack
1011
from kpi.constants import (
1112
ASSET_TYPE_SURVEY,
@@ -124,11 +125,20 @@ def _validate_fields(self, attrs: dict):
124125
return
125126

126127
source = attrs['source']
127-
form_pack, _unused = build_formpack(source, submission_stream=[])
128+
# We used to get all fields for every version for valid fields,
129+
# but the UI shows the latest version only, so only its fields
130+
# can be selected. This makes it easier to compare valid fields with
131+
# user's choice.
132+
form_pack, _unused = build_formpack(
133+
source, submission_stream=[], use_all_form_versions=False
134+
)
135+
# We do not want to include the version field.
136+
# See `_infer_version_id()` in `kobo.apps.reports.report_data.build_formpack`
137+
# for field name alternatives.
128138
valid_fields = [
129139
f.path for f in form_pack.get_fields_for_versions(
130140
form_pack.versions.keys()
131-
)
141+
) if not re.match(FUZZY_VERSION_PATTERN, f.path)
132142
]
133143

134144
source_fields = source.data_sharing.get('fields') or valid_fields
@@ -145,6 +155,11 @@ def _validate_fields(self, attrs: dict):
145155
}
146156
)
147157

158+
# Force `posted_fields` to be an empty list to avoid useless parsing when
159+
# fetching external xml endpoint (i.e.: /api/v2/assets/<asset_uid>/paired-data/<paired_data_uid>/external.xml)
160+
if sorted(valid_fields) == sorted(posted_fields):
161+
posted_fields = []
162+
148163
attrs['fields'] = posted_fields
149164

150165
def _validate_filename(self, attrs: dict):

kpi/tests/test_paired_data.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def test_cannot_retrieve_source_if_source_stop_data_sharing(self):
125125
self.assertEqual(source, self.source_asset)
126126
self.source_asset.data_sharing['enabled'] = False
127127
self.source_asset.save()
128-
source = self.paired_data.get_source()
128+
source = self.paired_data.get_source(force=True)
129129
self.assertEqual(source, None)
130130

131131
def test_cannot_retrieve_source_if_source_remove_perm(self):
@@ -134,5 +134,5 @@ def test_cannot_retrieve_source_if_source_remove_perm(self):
134134
self.source_asset.remove_perm(
135135
self.paired_data.asset.owner, PERM_VIEW_SUBMISSIONS
136136
)
137-
source = self.paired_data.get_source()
137+
source = self.paired_data.get_source(force=True)
138138
self.assertEqual(source, None)

kpi/utils/xml.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,26 @@
33
from typing import Optional, Union, List
44
from lxml import etree
55

6+
from django_request_cache import cache_for_request
7+
from shortuuid import ShortUUID
8+
69

710
def strip_nodes(
811
source: Union[str, bytes],
912
nodes_to_keep: list,
1013
use_xpath: bool = False,
1114
xml_declaration: bool = False,
1215
rename_root_node_to: Optional[str] = None,
16+
bulk_action_cache_key: str = None,
1317
) -> str:
1418
"""
1519
Returns a stripped version of `source`. It keeps only nodes provided in
1620
`nodes_to_keep`.
1721
If `rename_root_node_to` is provided, the root node will be renamed to the
1822
value of that parameter in the returned XML string.
23+
24+
A random string can be passed to `bulk_action_cache_key` to get the
25+
XPaths only once if calling `strip_nodes()` several times in a loop.
1926
"""
2027
# Force `source` to be bytes in case it contains an XML declaration
2128
# `etree` does not support strings with xml declarations.
@@ -28,7 +35,13 @@ def strip_nodes(
2835
root_element = tree.getroot()
2936
root_path = tree.getpath(root_element)
3037

31-
def get_xpath_matches():
38+
# `@cache_for_request` uses the parameters of the function it decorates
39+
# to generate the key under which the returned value of the function is
40+
# stored for cache purpose.
41+
# `cache_key` is only there to serve that purpose and ensure
42+
# `@cache_for_request` uniqueness.
43+
@cache_for_request
44+
def get_xpath_matches(cache_key: str):
3245
if use_xpath:
3346
xpaths_ = []
3447
for xpath_ in nodes_to_keep:
@@ -118,7 +131,14 @@ def remove_root_path(path_: str) -> str:
118131
return path_.replace(root_path, '')
119132

120133
if len(nodes_to_keep):
121-
xpath_matches = get_xpath_matches()
134+
# Always send a unique string to `get_xpath_matches()`
135+
# See comments above the function
136+
if bulk_action_cache_key is None:
137+
cache_key = ShortUUID().random(24)
138+
else:
139+
cache_key = bulk_action_cache_key
140+
141+
xpath_matches = get_xpath_matches(cache_key=cache_key)
122142
process_node(root_element, xpath_matches)
123143

124144
if rename_root_node_to:

kpi/views/v2/paired_data.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from rest_framework.decorators import action
88
from rest_framework.response import Response
99
from rest_framework_extensions.mixins import NestedViewSetMixin
10+
from shortuuid import ShortUUID
1011

1112
from kpi.constants import SUBMISSION_FORMAT_TYPE_XML
1213
from kpi.models import Asset, AssetFile, PairedData
@@ -252,6 +253,8 @@ def external(self, request, paired_data_uid, **kwargs):
252253
format_type=SUBMISSION_FORMAT_TYPE_XML
253254
)
254255
parsed_submissions = []
256+
allowed_fields = paired_data.allowed_fields
257+
random_uuid = ShortUUID().random(24)
255258

256259
for submission in submissions:
257260
# Use `rename_root_node_to='data'` to rename the root node of each
@@ -263,9 +266,10 @@ def external(self, request, paired_data_uid, **kwargs):
263266
parsed_submissions.append(
264267
strip_nodes(
265268
submission,
266-
paired_data.allowed_fields,
269+
allowed_fields,
267270
use_xpath=True,
268271
rename_root_node_to='data',
272+
bulk_action_cache_key=random_uuid,
269273
)
270274
)
271275

0 commit comments

Comments
 (0)