Skip to content

Commit 55372cc

Browse files
ic
0 parents  commit 55372cc

File tree

7 files changed

+169
-0
lines changed

7 files changed

+169
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.DS_Store

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Airflow Plugin - Google Analytics

__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from airflow.plugins_manager import AirflowPlugin
2+
from google_analytics_plugin.hooks.google_analytics_hook import GoogleAnalyticsHook
3+
from google_analytics_plugin.operators.google_analytics_reporting_to_s3_operator import GoogleAnalyticsReportingToS3Operator
4+
5+
class GoogleAnalyticsPlugin(AirflowPlugin):
    """Airflow plugin exposing the Google Analytics hook and operator.

    Only ``hooks`` and ``operators`` carry entries; every other plugin
    extension point is declared as an empty list.
    """

    # Plugin name declared to Airflow's plugin manager.
    name = "google_analytics_plugin"

    # Components contributed by this plugin.
    hooks = [GoogleAnalyticsHook]
    operators = [GoogleAnalyticsReportingToS3Operator]

    # Extension points this plugin does not populate.
    executors = []
    macros = []
    admin_views = []
    flask_blueprints = []
    menu_links = []

hooks/__init__.py

Whitespace-only changes.

hooks/google_analytics_hook.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from airflow.hooks.base_hook import BaseHook
2+
3+
from apiclient.discovery import build
4+
from oauth2client.service_account import ServiceAccountCredentials
5+
6+
import time
7+
8+
class GoogleAnalyticsHook(BaseHook):
    """Hook for pulling reports from the Google Analytics Reporting API v4.

    Service-account credentials are taken from the ``client_secrets`` entry
    of the Airflow connection's extra JSON.
    """

    def __init__(self, google_analytics_conn_id='google_analytics_default'):
        """Look up the connection and cache its service-account key.

        :param google_analytics_conn_id: Airflow connection id whose extra
            JSON contains a ``client_secrets`` entry.
        :raises KeyError: if the connection extra has no ``client_secrets``.
        """
        self.google_analytics_conn_id = google_analytics_conn_id
        self.connection = self.get_connection(google_analytics_conn_id)

        self.client_secrets = self.connection.extra_dejson['client_secrets']

    def get_service_object(self, api_name, api_version, scopes):
        """Build an authorized Google API client.

        :param api_name: API name passed to ``build`` (e.g. ``analyticsreporting``).
        :param api_version: API version passed to ``build`` (e.g. ``v4``).
        :param scopes: list of OAuth scopes for the service-account credentials.
        :return: the service object returned by ``build``.
        """
        credentials = ServiceAccountCredentials.from_json_keyfile_dict(self.client_secrets, scopes)
        return build(api_name, api_version, credentials=credentials)

    def get_analytics_report(self, view_id, since, until, sampling_level, dimensions, metrics, page_size, include_empty_rows):
        """Run a single report request and follow pagination to the end.

        :param view_id: value sent as the request's ``viewId``.
        :param since: ``startDate`` of the single date range.
        :param until: ``endDate`` of the single date range.
        :param sampling_level: ``samplingLevel``; falls back to ``'LARGE'`` when falsy.
        :param dimensions: value sent as ``dimensions``.
        :param metrics: value sent as ``metrics``.
        :param page_size: ``pageSize``; falls back to ``100`` when falsy.
        :param include_empty_rows: ``includeEmptyRows``; falls back to ``False`` when falsy.
        :return: the last report page returned by the API, with
            ``report['data']['rows']`` replaced by the rows of all pages;
            ``{}`` when the response contains no reports.
        """
        analytics = self.get_service_object('analyticsreporting', 'v4', ['https://www.googleapis.com/auth/analytics.readonly'])

        report_request = {
            'viewId': view_id,
            'dateRanges': [{'startDate': since, 'endDate': until}],
            'samplingLevel': sampling_level or 'LARGE',
            'dimensions': dimensions,
            'metrics': metrics,
            'pageSize': page_size or 100,
            'includeEmptyRows': include_empty_rows or False,
        }

        response = analytics.reports().batchGet(body={'reportRequests': [report_request]}).execute()

        if not response.get('reports'):
            return {}

        report = response['reports'][0]
        rows = list(report.get('data', {}).get('rows', []))

        while report.get('nextPageToken'):
            # Pause between page requests (presumably to stay under API
            # rate limits -- confirm against the quota in use).
            time.sleep(1)
            report_request['pageToken'] = report['nextPageToken']
            response = analytics.reports().batchGet(body={'reportRequests': [report_request]}).execute()
            report = response['reports'][0]
            rows.extend(report.get('data', {}).get('rows', []))

        # The last page may lack a 'data' key (or have an empty one); use
        # .get()/setdefault() so that neither raises KeyError nor discards
        # rows accumulated from earlier pages.
        if rows or report.get('data'):
            report.setdefault('data', {})['rows'] = rows

        return report

operators/__init__.py

Whitespace-only changes.
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
from google_analytics_plugin.hooks.google_analytics_hook import GoogleAnalyticsHook
2+
3+
from airflow.hooks.S3_hook import S3Hook
4+
from airflow.models import BaseOperator
5+
6+
import hashlib
7+
import json
8+
import os
9+
from datetime import datetime
10+
11+
class GoogleAnalyticsReportingToS3Operator(BaseOperator):
    """Fetch a Google Analytics report and upload it to S3 as JSON lines.

    One JSON object is written per entry in each row's ``metrics`` list.
    Every object holds the row's dimensions, that entry's metric values,
    ``viewid`` and ``timestamp`` (the rendered ``since`` value).

    :param google_analytics_conn_id: connection id given to ``GoogleAnalyticsHook``.
    :param view_id: view id forwarded to the hook and written to each record.
    :param since: templated start datetime, parsed as ``%Y-%m-%d %H:%M:%S``.
    :param until: templated end datetime, parsed as ``%Y-%m-%d %H:%M:%S``.
    :param sampling_level: forwarded to ``get_analytics_report``.
    :param dimensions: forwarded to ``get_analytics_report``.
    :param metrics: forwarded to ``get_analytics_report``.
    :param page_size: forwarded to ``get_analytics_report``.
    :param include_empty_rows: forwarded to ``get_analytics_report``.
    :param s3_conn_id: connection id given to ``S3Hook``.
    :param s3_bucket: destination bucket.
    :param s3_key: templated destination key.
    """

    template_fields = ('s3_key', 'since', 'until')

    def __init__(self,
                 google_analytics_conn_id,
                 view_id,
                 since,
                 until,
                 sampling_level,
                 dimensions,
                 metrics,
                 page_size,
                 include_empty_rows,
                 s3_conn_id,
                 s3_bucket,
                 s3_key,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        self.google_analytics_conn_id = google_analytics_conn_id
        self.view_id = view_id
        self.since = since
        self.until = until
        self.sampling_level = sampling_level
        self.dimensions = dimensions
        self.metrics = metrics
        self.page_size = page_size
        self.include_empty_rows = include_empty_rows
        self.s3_conn_id = s3_conn_id
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key

        # Maps metric types reported by the API to SQL column types.
        self.metricMap = {
            'METRIC_TYPE_UNSPECIFIED': 'varchar(255)',
            'CURRENCY': 'decimal(20,5)',
            'INTEGER': 'int(11)',
            'FLOAT': 'decimal(20,5)',
            'PERCENT': 'decimal(20,5)',
            'TIME': 'time',
        }

    def execute(self, context):
        """Pull the report, write it to a temp file, and upload it to S3.

        The temp file is created with ``tempfile`` so that S3 keys containing
        ``/`` do not break local file creation, and it is removed even when
        the upload fails.
        """
        # Function-scope import: keeps this block self-contained without
        # touching the module's import section.
        import tempfile

        ga_conn = GoogleAnalyticsHook(self.google_analytics_conn_id)
        s3_conn = S3Hook(self.s3_conn_id)

        # This has to be here because template_fields are not yet parsed in the __init__ function
        since_formatted = datetime.strptime(self.since, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
        until_formatted = datetime.strptime(self.until, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')

        report = ga_conn.get_analytics_report(
            self.view_id,
            since_formatted,
            until_formatted,
            self.sampling_level,
            self.dimensions,
            self.metrics,
            self.page_size,
            self.include_empty_rows,
        )

        column_header = report.get('columnHeader', {})
        # Right now all dimensions are hardcoded to varchar(255), will need a map if any non-varchar dimensions are used in the future
        # Unfortunately the API does not send back types for Dimensions like it does for Metrics (yet..)
        dimension_headers = [
            {'name': header.replace('ga:', ''), 'type': 'varchar(255)'}
            for header in column_header.get('dimensions', [])
        ]
        metric_headers = [
            {
                'name': entry.get('name').replace('ga:', ''),
                'type': self.metricMap.get(entry.get('type'), 'varchar(255)'),
            }
            for entry in column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
        ]

        # Record keys are the lower-cased header names; compute them once
        # instead of once per row.
        dimension_keys = [header['name'].lower() for header in dimension_headers]
        metric_keys = [header['name'].lower() for header in metric_headers]

        rows = report.get('data', {}).get('rows', [])

        ga_file = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False)
        file_name = ga_file.name
        try:
            with ga_file:
                for row in rows:
                    root_data_obj = {}
                    for index, dimension in enumerate(row.get('dimensions', [])):
                        root_data_obj[dimension_keys[index]] = dimension

                    for metric in row.get('metrics', []):
                        data = dict(root_data_obj)

                        for index, value in enumerate(metric.get('values', [])):
                            data[metric_keys[index]] = value

                        data['viewid'] = self.view_id
                        data['timestamp'] = self.since

                        # Every record is newline-terminated. The previous
                        # "skip newline on last row" check could never be
                        # true, so the output is unchanged.
                        ga_file.write(json.dumps(data) + '\n')

            s3_conn.load_file(file_name, self.s3_key, self.s3_bucket, True)
        finally:
            # Always clean up the local copy, including on upload failure.
            os.remove(file_name)

0 commit comments

Comments
 (0)