pFad - Phone/Frame/Anonymizer/Declutterfier! Saves Data!


--- a PPN by Garber Painting Akron. With Image Size Reduction included!

URL: http://github.com/EuroPython/programapi/commit/fc44122072f6c1a2b06d294bd66bd89ef8b136e6

s" /> add basic download/transform implementations · EuroPython/programapi@fc44122 · GitHub
Skip to content

Commit fc44122

Browse files
committed
add basic download/transform implementations
1 parent e7e46a0 commit fc44122

File tree

4 files changed

+249
-13
lines changed

4 files changed

+249
-13
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@ COPY src/ ./src/
99
COPY Makefile .
1010

1111

12-
CMD ["make", "update"]
12+
CMD ["make", "all"]

Makefile

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@ deps/install:
1010

1111
install: deps/install
1212

13-
update:
14-
mkdir -p data/
15-
python src/save.py
16-
1713
download:
1814
cd src && python download.py
15+
16+
transform:
17+
cd src && python transform.py
18+
19+
20+
all: download transform

src/download.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@ class Config:
2121
base_url = f"https://pretalx.com/api/events/{Config.event}/"
2222

2323
resources = [
24-
"submissions",
25-
"speakers",
24+
# The `questions` parameter needs to be passed so that answers are included
25+
# in the same response, which saves us from joining the answers later.
26+
"submissions?questions=all",
27+
"speakers?questions=all",
2628
]
2729

2830
for resource in resources:
@@ -40,9 +42,9 @@ class Config:
4042
data = response.json()
4143
res0 += data["results"]
4244

43-
fnames = [
44-
f"../data/raw/{Config.event}/{resource}_latest.json",
45-
]
46-
for fname in fnames:
47-
with open(fname, "w") as fd:
48-
json.dump(res0, fd)
45+
filename = resource.split("?")[0] # To get rid of "?questions"
46+
filename = f"{filename}_latest.json"
47+
filepath = f"../data/raw/{Config.event}/{filename}"
48+
49+
with open(filepath, "w") as fd:
50+
json.dump(res0, fd)

src/transform.py

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
import json
2+
import os
3+
from collections import defaultdict
4+
from datetime import date, datetime, time, timedelta
5+
6+
from pydantic import BaseModel
7+
from pydantic.class_validators import root_validator
8+
from slugify import slugify
9+
10+
11+
class SpeakerQuestion:
    """
    Question texts that are matched against speaker answers.

    Each attribute holds the exact text an answer's question must have for
    that answer to be copied onto the speaker model.
    """

    # Social links
    homepage = "Social (Homepage)"
    twitter = "Social (X/Twitter)"
    mastodon = "Social (Mastodon)"

    # Where the speaker works or studies
    affiliation = "Company / Organization / Educational Institution"
16+
17+
18+
class SubmissionQuestion:
    """
    Question texts that are matched against submission answers.

    Each attribute holds the exact text an answer's question must have for
    that answer to be copied onto the submission model.
    """

    # Short, social-media sized summary of the abstract
    tweet = "Abstract as a tweet / toot"

    # Outline of the session
    outline = "Outline"
21+
22+
23+
class SubmissionState:
    """
    Submission state values that this module compares against.
    """

    # These two states are the ones treated as publishable
    accepted = "accepted"
    confirmed = "confirmed"

    # Declared for completeness
    withdrawn = "withdrawn"
27+
28+
29+
class PretalxAnswer(BaseModel):
    """
    A single answer to a question, flattened from the raw answer dict.
    """

    question_text: str
    answer_text: str
    answer_file: str | None
    submission_id: str | None
    speaker_id: str | None

    @root_validator(pre=True)
    def extract(cls, values):
        """
        Copy the raw keys onto the flat field names declared on the model.
        """
        # Only the english text of the question is kept
        values["question_text"] = values["question"]["question"]["en"]

        # (model field, raw key) pairs, applied in this order. The
        # "answer_file" entry copies the key onto itself, which means the key
        # has to be present in the raw data.
        field_sources = (
            ("answer_text", "answer"),
            ("answer_file", "answer_file"),
            ("submission_id", "submission"),
            ("speaker_id", "person"),
        )
        for field_name, raw_key in field_sources:
            values[field_name] = values[raw_key]

        return values
44+
45+
46+
class PretalxSpeaker(BaseModel):
    """
    A speaker, with a handful of selected answers promoted to fields.
    """

    code: str
    name: str
    biography: str | None
    avatar: str | None
    slug: str
    answers: list[PretalxAnswer]
    submissions: list[str]

    # Extracted
    affiliation: str | None = None
    homepage: str | None = None
    twitter: str | None = None
    mastodon: str | None = None

    @root_validator(pre=True)
    def extract(cls, values):
        """
        Derive the slug, promote the known answers and drop the answer list.
        """
        values["slug"] = slugify(values["name"])

        parsed_answers = [PretalxAnswer.parse_obj(raw) for raw in values["answers"]]

        # Question text -> name of the field that receives the answer.
        #
        # NOTE: in practice the format of the twitter data is different,
        # depending on the speaker. We could fix this here by parsing the
        # answer_text to some standardised format (either @handle or
        # https://twitter.com/handle url, etc)
        field_by_question = {
            SpeakerQuestion.affiliation: "affiliation",
            SpeakerQuestion.homepage: "homepage",
            SpeakerQuestion.twitter: "twitter",
            SpeakerQuestion.mastodon: "mastodon",
        }

        for parsed in parsed_answers:
            target_field = field_by_question.get(parsed.question_text)
            if target_field is not None:
                values[target_field] = parsed.answer_text

        # Remove all the other answers
        # This is important, because some answers might contain non-public
        # information
        values["answers"] = []

        return values
90+
91+
92+
class PretalxSubmission(BaseModel):
    """
    A submission, with selected answers promoted to fields.

    The pre-validation hook flattens localised names, reduces speakers to
    their codes, derives the slug and website URL, and drops the answer list.
    """

    code: str
    title: str
    speakers: list[str]  # We only want the code, not the full info
    submission_type: str
    slug: str
    track: str | None
    state: str
    abstract: str
    answers: list[PretalxAnswer]
    tweet: str = ""
    # No default: this is only filled from an answer whose question text
    # matches SubmissionQuestion.outline.
    outline: str
    duration: str

    level: str = ""
    delivery: str | None = ""

    # This is embedding a slot inside a submission for easier lookup later
    room: str | None = None
    start: datetime | None = None
    end: datetime | None = None

    # TODO: once we have schedule data then we can prefill those in the code
    # here
    talks_in_parallel: list[str] | None = None
    talks_after: list[str] | None = None
    next_talk_code: str | None = None
    prev_talk_code: str | None = None

    website_url: str | None = None

    @root_validator(pre=True)
    def extract(cls, values):
        """
        Normalise the raw submission dict before field validation.
        """
        # SubmissionType and Track have localised names. For this project we
        # only care about their english versions, so we can extract them here.
        # In 2024 some of those are localised, and some are not. Instead of
        # figuring out why and fixing the data, there's this hack: only dicts
        # are unwrapped, anything else (including None) is left untouched.
        for field in ("submission_type", "track"):
            if isinstance(values[field], dict):
                values[field] = values[field]["en"]

        # Keep only the speaker codes. dict.fromkeys removes duplicates while
        # preserving the original order, so the resulting list is
        # deterministic (a set would give an arbitrary order).
        values["speakers"] = list(
            dict.fromkeys(s["code"] for s in values["speakers"])
        )

        answers = [PretalxAnswer.parse_obj(ans) for ans in values["answers"]]

        for answer in answers:
            if answer.question_text == SubmissionQuestion.outline:
                values["outline"] = answer.answer_text
            if answer.question_text == SubmissionQuestion.tweet:
                values["tweet"] = answer.answer_text

            # TODO if we need any other questions

        slug = slugify(values["title"])
        values["slug"] = slug
        values["website_url"] = f"https://ep2024.europython.eu/session/{slug}"

        # Remove all the other answers
        # This is important, because some answers might contain non-public
        # information
        values["answers"] = []

        return values

    @property
    def is_accepted(self) -> bool:
        """True when the state equals SubmissionState.accepted."""
        return self.state == SubmissionState.accepted

    @property
    def is_confirmed(self) -> bool:
        """True when the state equals SubmissionState.confirmed."""
        return self.state == SubmissionState.confirmed

    @property
    def is_publishable(self) -> bool:
        """True when the submission is either accepted or confirmed."""
        return self.is_accepted or self.is_confirmed
171+
172+
173+
def parse_submissions() -> list[PretalxSubmission]:
    """
    Returns every submission found in the raw submissions file.

    No filtering by state happens here; each item of the JSON file is parsed
    into a PretalxSubmission and returned in file order.
    """
    with open("../data/raw/europython-2024/submissions_latest.json") as fd:
        raw_items = json.load(fd)

    return [PretalxSubmission.parse_obj(item) for item in raw_items]
185+
186+
187+
def parse_speakers() -> list[PretalxSpeaker]:
    """
    Returns every speaker found in the raw speakers file.

    No filtering happens here; each item of the JSON file is parsed into a
    PretalxSpeaker and returned in file order.
    """
    with open("../data/raw/europython-2024/speakers_latest.json") as fd:
        raw_items = json.load(fd)

    return [PretalxSpeaker.parse_obj(item) for item in raw_items]
199+
200+
201+
def publishable_submissions() -> dict[str, PretalxSubmission]:
    """
    Returns the publishable submissions, keyed by their code.
    """
    by_code = {}
    for submission in parse_submissions():
        if submission.is_publishable:
            by_code[submission.code] = submission

    return by_code
203+
204+
205+
def publishable_speakers(accepted_proposals: set[str]) -> dict[str, PretalxSpeaker]:
    """
    Returns the speakers that have at least one accepted proposal.

    accepted_proposals holds the codes of the accepted submissions. Each
    returned speaker has its submissions overwritten with only the accepted
    ones, sorted so that the result is deterministic. The output is keyed by
    speaker code.
    """
    sp = parse_speakers()

    # Normalise once, so that any collection of codes (e.g. a dict keys view)
    # is handled the same way as a real set.
    accepted_codes = set(accepted_proposals)

    output = {}
    for speaker in sp:
        accepted = accepted_codes.intersection(speaker.submissions)
        if accepted:
            # Overwrite with only the accepted proposals
            speaker.submissions = sorted(accepted)
            output[speaker.code] = speaker

    return output
216+
217+
218+
from pprint import pprint

# Summary of what was parsed: total vs. publishable submissions
print(len(parse_submissions()))
accepted = publishable_submissions()
print(len(accepted))

# Total speakers vs. speakers with at least one publishable submission.
# The publishable speakers are computed once and reused for both prints.
print(len(parse_speakers()))
speakers = publishable_speakers(set(accepted))
print(len(speakers))

print(speakers)

pprint(accepted)

# Check if all the slugs are unique. This raises explicitly instead of using
# assert, because asserts are skipped when Python runs with -O.
unique_slugs = {s.slug for s in accepted.values()}
if len(unique_slugs) != len(accepted):
    raise ValueError("Publishable submissions do not have unique slugs")

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad © 2024 Your Company Name. All rights reserved.





Check this box to remove all script contents from the fetched content.



Check this box to remove all images from the fetched content.


Check this box to remove all CSS styles from the fetched content.


Check this box to keep images inefficiently compressed and original size.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy