
Commit 6b13b26

skyeslattery authored and mateow99 committed

Switch the scraper process to run in a separate Docker container

1 parent c4788cb · commit 6b13b26

File tree

- app.py
- docker-compose.yml
- scraper.py
- src/utils/helpers.py

4 files changed: +93 -82 lines changed

app.py

Lines changed: 1 addition & 51 deletions

@@ -2,13 +2,9 @@
 import argparse
 from flask import Flask
 from flask_graphql import GraphQLView
-from flask_apscheduler import APScheduler
 from graphene import Schema
 from src.schema import Query, Mutation
-from src.scrapers.games_scraper import fetch_game_schedule
-from src.scrapers.youtube_stats import fetch_videos
 from src.utils.team_loader import TeamLoader
-import time
 import os
 import signal
 import sys
@@ -18,10 +14,6 @@
 
 app = Flask(__name__)
 
-# Set up the scheduler
-scheduler = APScheduler()
-scheduler.init_app(app)
-
 # Configure logging
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
@@ -40,53 +32,11 @@ def create_context():
     "/graphql", view_func=GraphQLView.as_view("graphql", schema=schema, graphiql=True, get_context=create_context)
 )
 
-# Setup command line arguments
-def parse_args():
-    parser = argparse.ArgumentParser(description="Skip scraping tasks, for dev purposes.")
-    parser.add_argument(
-        "--no-scrape",
-        action="store_true",
-        help="Skips scraping tasks if set, useful for frontend development.",
-    )
-    return parser.parse_args()
-
-
 def signal_handler(sig, frame):
-    """Handle Ctrl+C by shutting down scheduler and exiting"""
-    scheduler.shutdown()
     sys.exit(0)
 
 signal.signal(signal.SIGINT, signal_handler)
 signal.signal(signal.SIGTERM, signal_handler)
 
 if __name__ == "__main__":
-    args = parse_args()
-
-    if not args.no_scrape:
-        # scrapes games every 5 minutes.
-        # need testing to see if this can be lower safely
-        @scheduler.task("interval", id="scrape_schedules", seconds=300)
-        def scrape_schedules():
-            start_time = time.time()
-            logging.info("Starting scraping games")
-            fetch_game_schedule()
-            elapsed_time = time.time() - start_time
-            logging.info(f"Completed scraping games in {elapsed_time:.2f} seconds")
-
-        @scheduler.task("interval", id="scrape_videos", seconds=43200)
-        def scrape_videos():
-            logging.info("Scraping YouTube videos")
-            fetch_videos()
-
-        scheduler.start()
-
-        scrape_schedules()
-        scrape_videos()
-
-
-    try:
-        debug = os.getenv("STAGE") != "prod"
-        app.run(debug=True, host="0.0.0.0", port=8000)
-    except KeyboardInterrupt:
-        scheduler.shutdown()
-        sys.exit(0)
+    app.run(debug=True, host="0.0.0.0", port=8000)
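
One carry-over worth flagging: the deleted block computed debug = os.getenv("STAGE") != "prod" but never used it, and the surviving call still hard-codes debug=True. A minimal sketch of what an env-driven entrypoint would look like, keeping the STAGE convention from the deleted code (this variant is illustrative, not part of the commit):

    import os
    from flask import Flask

    app = Flask(__name__)

    if __name__ == "__main__":
        # Assumed convention from the deleted code: STAGE=prod disables debug mode.
        debug = os.getenv("STAGE") != "prod"
        app.run(debug=debug, host="0.0.0.0", port=8000)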

docker-compose.yml

Lines changed: 7 additions & 0 deletions

@@ -8,3 +8,10 @@ services:
       - "8000:8000"
     volumes:
       - ./ca-certificate.crt:/etc/ssl/ca-certificate.crt:ro # Mount MongoDB cert inside the container, ro for read only
+
+  scraper:
+    image: cornellappdev/score-dev:${IMAGE_TAG}
+    env_file: .env
+    command: python scraper.py
+    volumes:
+      - ./ca-certificate.crt:/etc/ssl/ca-certificate.crt:ro

scraper.py

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+import logging
+import time
+from flask import Flask
+from flask_apscheduler import APScheduler
+import signal
+import sys
+from src.scrapers.games_scraper import fetch_game_schedule
+from src.scrapers.youtube_stats import fetch_videos
+
+app = Flask(__name__)
+
+# Set up the scheduler
+scheduler = APScheduler()
+scheduler.init_app(app)
+
+# Configure logging
+logging.basicConfig(
+    format="%(asctime)s %(levelname)-8s %(message)s",
+    level=logging.INFO,
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+
+@scheduler.task("interval", id="scrape_schedules", seconds=300)
+def scrape_schedules():
+    start_time = time.time()
+    logging.info("Starting scraping games")
+    fetch_game_schedule()
+    elapsed_time = time.time() - start_time
+    logging.info(f"Completed scraping games in {elapsed_time:.2f} seconds")
+
+@scheduler.task("interval", id="scrape_videos", seconds=43200)
+def scrape_videos():
+    logging.info("Scraping YouTube videos")
+    fetch_videos()
+
+def signal_handler(sig, frame):
+    scheduler.shutdown()
+    sys.exit(0)
+
+signal.signal(signal.SIGINT, signal_handler)
+signal.signal(signal.SIGTERM, signal_handler)
+
+if __name__ == "__main__":
+    scheduler.start()
+    scrape_schedules()
+    scrape_videos()
+    app.run(host="0.0.0.0", port=8001)
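
APScheduler interval triggers wait a full period before their first run, which is presumably why scrape_schedules() and scrape_videos() are also called once directly at startup: the container begins with fresh data instead of idling for up to five minutes (or twelve hours for videos). A minimal sketch of that pattern in isolation, using the same flask_apscheduler API as the file above (the job body and ten-second interval are stand-ins):

    import logging
    import time
    from flask import Flask
    from flask_apscheduler import APScheduler

    app = Flask(__name__)
    scheduler = APScheduler()
    scheduler.init_app(app)

    @scheduler.task("interval", id="demo_job", seconds=10)
    def demo_job():
        # Stand-in for a real scraping task.
        logging.info("interval job fired at %s", time.strftime("%H:%M:%S"))

    if __name__ == "__main__":
        logging.basicConfig(level=logging.INFO)
        scheduler.start()
        demo_job()  # run once immediately; the first scheduled run is a full interval away
        app.run(host="0.0.0.0", port=8001)

The scraper binds 8001 instead of 8000, and its compose service publishes no ports, so its Flask server is reachable only from inside the container network.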

src/utils/helpers.py

Lines changed: 38 additions & 31 deletions

@@ -1,3 +1,4 @@
+import logging
 import requests
 from PIL import Image
 from io import BytesIO
@@ -16,35 +17,41 @@ def get_dominant_color(image_url, white_threshold=200, black_threshold=50):
     Returns:
         color: The hex code of the dominant color.
     """
-    response = requests.get(image_url)
-    image = Image.open(BytesIO(response.content)).convert("RGBA")
-
-    image = image.resize((50, 50))
-    image = image.quantize(colors=5).convert("RGBA")
-    pixels = image.getdata()
-
-    filtered_pixels = [
-        pixel
-        for pixel in pixels
-        if not (
-            pixel[0] > white_threshold
-            and pixel[1] > white_threshold
-            and pixel[2] > white_threshold
+    default_color = "#000000"
+
+    try:
+        response = requests.get(image_url)
+        image = Image.open(BytesIO(response.content)).convert("RGBA")
+
+        image = image.resize((50, 50))
+        image = image.quantize(colors=5).convert("RGBA")
+        pixels = image.getdata()
+
+        filtered_pixels = [
+            pixel
+            for pixel in pixels
+            if not (
+                pixel[0] > white_threshold
+                and pixel[1] > white_threshold
+                and pixel[2] > white_threshold
+            )
+            and not (
+                pixel[0] < black_threshold
+                and pixel[1] < black_threshold
+                and pixel[2] < black_threshold
+            )
+        ]
+
+        if filtered_pixels:
+            pixel_count = Counter(filtered_pixels)
+            dominant_color = pixel_count.most_common(1)[0][0]
+        else:
+            dominant_color = (0, 0, 0)
+
+        hex_color = "#{:02x}{:02x}{:02x}".format(
+            dominant_color[0], dominant_color[1], dominant_color[2]
         )
-        and not (
-            pixel[0] < black_threshold
-            and pixel[1] < black_threshold
-            and pixel[2] < black_threshold
-        )
-    ]
-
-    if filtered_pixels:
-        pixel_count = Counter(filtered_pixels)
-        dominant_color = pixel_count.most_common(1)[0][0]
-    else:
-        dominant_color = (0, 0, 0)
-
-    hex_color = "#{:02x}{:02x}{:02x}".format(
-        dominant_color[0], dominant_color[1], dominant_color[2]
-    )
-    return hex_color
+        return hex_color
+    except Exception as e:
+        logging.error(f"Error in get_dominant_color for {image_url}: {e}")
+        return default_color
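
A hypothetical call to the hardened helper, showing the new fallback behavior (the URL is a placeholder):

    from src.utils.helpers import get_dominant_color

    # Placeholder logo URL; thresholds default to white_threshold=200, black_threshold=50.
    color = get_dominant_color("https://example.com/team-logo.png")
    print(color)  # a hex string such as "#1f4e79"; "#000000" if the fetch or decode fails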
