# classes.py
import os
import asyncio
import json
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from openai import OpenAI
import pandas as pd
import PyPDF2
from secret import openai_key
class PDFScraper:
    def __init__(self, verbose):
        self.verbose = verbose
        self.client = OpenAI(api_key=openai_key)
async def get_pdf_links_from_url(self, root_url, deep=False, url_depth=1):
"""
Crawls a root_url for all pdf links.
With deep=True, crawling occurs on found links, with a depth of url_depth.
"""
browser_config = BrowserConfig(
browser_type='chromium',
headless=True,
verbose=self.verbose
)
deep_crawl_strategy = None
if deep:
scorer = KeywordRelevanceScorer(
keywords=["annual", "financial", "report", "pdf", "download", "investor", "publication"],
weight=0.7
)
deep_crawl_strategy = BestFirstCrawlingStrategy(
max_depth=url_depth,
url_scorer=scorer,
include_external=False
)
crawl_config = CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url=root_url,
config=crawl_config,
)
            # With a deep-crawl strategy, arun returns a list of results; normalize
            # the single-result case so both paths share one collection loop.
            results = result if isinstance(result, list) else [result]
            pdfs = []
            for r in results:
                for link in r.links.get("internal", []) + r.links.get("external", []):
                    if '.pdf' in link['href']:
                        pdfs.append(link)
            if self.verbose:
                print(pdfs)
            return pd.DataFrame(pdfs)
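
    # A minimal usage sketch (the URL is a hypothetical placeholder):
    # get_pdf_links_from_url is a coroutine, so it must be driven by an event
    # loop, e.g.:
    #
    #   scraper = PDFScraper(verbose=True)
    #   links = asyncio.run(scraper.get_pdf_links_from_url(
    #       "https://example.com/investor-relations", deep=True, url_depth=2))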
def find_annual_reports_from_pdf_links(self, links, special_instructions):
        system_prompt = """
        You are a utility which picks out the relevant annual reports from a list of URLs.
        """
        prompt = f"""
        Get the annual reports from a list of candidate PDF URLs. There must be only one
        annual report per year. We are looking for the one with financial statements.
        Make sure the year is correct. If a date is present in a URL or filename, it
        might be an upload date. To resolve this, note that an annual report is always
        uploaded after the year it covers.
        Sort the output by year descending.
        If there are missing years between the newest and oldest year, there is a high
        likelihood that the reports exist in the list under less obvious names. Try to
        ensure those gaps are filled.
        Special instructions: {special_instructions}
        Dataset: {links}
        """
schema = {
"type": "object",
"properties": {
"annual_reports": {
"type": "array",
"items": {
"type": "object",
"properties": {
"year": {
"type": "integer",
"description": "The fiscal year of the report."
},
"filename": {
"type": "string",
"description": "The name of the PDF file."
},
"url": {
"type": "string",
"description": "The URL linking to the annual report PDF."
}
},
"required": ["year", "filename", "url"],
"additionalProperties": False
}
}
},
"required": ["annual_reports"],
"additionalProperties": False
}
if self.verbose:
print("Querying ChatGPT to find annual reports from PDF links...")
response = self.client.responses.create(
model="o3-mini",
input=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt}
],
text={
"format": {
"type": "json_schema",
"name": "annual_report",
"schema": schema,
"strict": True
}
}
)
annual_reports = pd.DataFrame(json.loads(response.output_text)['annual_reports']).set_index('year')
if self.verbose:
print(annual_reports)
suspicious_years = []
for year in range(1980, 2025):
            if (str(year) in json.dumps(links)) and (year not in annual_reports.index.values):
suspicious_years.append(year)
if len(suspicious_years) > 0:
print(f"Warning: years found in links without annual reports: {suspicious_years}")
return annual_reports
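
# A minimal sketch of chaining the two PDFScraper steps; the URL and the special
# instructions below are hypothetical placeholders:
#
#   scraper = PDFScraper(verbose=False)
#   links = asyncio.run(scraper.get_pdf_links_from_url("https://example.com/reports"))
#   reports = scraper.find_annual_reports_from_pdf_links(
#       links.to_dict(orient="records"),
#       special_instructions="Prefer English-language reports.")
#
# Passing a list of records keeps the json.dumps sanity check in
# find_annual_reports_from_pdf_links working; a raw DataFrame would not serialize.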
class DataParser:
    def __init__(self, verbose):
        self.verbose = verbose
        self.client = OpenAI(api_key=openai_key)
def parse_report_for_year(self, report_path, report_name, year, special_instructions=""):
        pdf_path = os.path.join(report_path, report_name)
        filesize = os.path.getsize(pdf_path) / 1024**2  # size in MB
final_dict = None
if self.verbose:
print(f"Processing {pdf_path} for {year}")
expected_balance_sheet = [
"total_assets", "current_assets", "non_current_assets",
"total_liabilities", "current_liabilities", "non_current_liabilities",
"total_shareholders_equity", "share_capital", "retained_earnings"
]
expected_income_statement = [
"total_revenue", "total_expenses", "net_gains", "net_losses",
"earnings_per_share"
]
expected_cash_flow_statement = [
"cash_and_cash_equivalents_at_beginning_of_period",
"cash_and_cash_equivalents_at_end_of_period",
"cash_from_operations", "cash_from_investing", "cash_from_financing",
"net_change_in_cash", "free_cash_flow"
]
with open(pdf_path, "rb") as infile:
reader = PyPDF2.PdfReader(infile)
num_pages = len(reader.pages)
            # Split PDFs that exceed the single-request limits (roughly 32 MB or 100 pages)
            if filesize > 32 or num_pages > 100:
                print(f"File {pdf_path} too large for a single request. Splitting the PDF.")
balance_sheet_pages = []
income_statement_pages = []
cash_flow_statement_pages = []
for i in range(num_pages):
page = reader.pages[i]
text = page.extract_text()
if text and 'balance sheet' in text.lower():
for j in range(i, min(i + 3, num_pages)):
if j not in balance_sheet_pages:
balance_sheet_pages.append(j)
if text and 'income statement' in text.lower():
for j in range(i, min(i + 3, num_pages)):
if j not in income_statement_pages:
income_statement_pages.append(j)
                    if text and 'cash flow statement' in text.lower():
                        for j in range(i, min(i + 3, num_pages)):
                            if j not in cash_flow_statement_pages:
                                cash_flow_statement_pages.append(j)
                balance_sheet_pdf = os.path.join(report_path, f"_balance_sheet_{report_name}")
                income_statement_pdf = os.path.join(report_path, f"_income_statement_{report_name}")
                cash_flow_statement_pdf = os.path.join(report_path, f"_cash_flow_statement_{report_name}")
self.save_reduced_pdf(pdf_path, balance_sheet_pdf, balance_sheet_pages)
self.save_reduced_pdf(pdf_path, income_statement_pdf, income_statement_pages)
self.save_reduced_pdf(pdf_path, cash_flow_statement_pdf, cash_flow_statement_pages)
balance_sheet_dict = self.parse_reduced_file(balance_sheet_pdf, 'balance sheet', year, special_instructions, expected_balance_sheet)
income_statement_dict = self.parse_reduced_file(income_statement_pdf, 'income statement', year, special_instructions, expected_income_statement)
cash_flow_statement_dict = self.parse_reduced_file(cash_flow_statement_pdf, 'cash flow statement', year, special_instructions, expected_cash_flow_statement)
final_dict = {
'year': year,
'balance_sheet': balance_sheet_dict,
'income_statement': income_statement_dict,
'cash_flow_statement': cash_flow_statement_dict
}
            # Parse a small PDF in a single request
            else:
                if self.verbose:
                    print(f"Parsing {report_name} for all data.")
                    print("Uploading to OpenAI...")
                with open(pdf_path, "rb") as f:
                    openai_file = self.client.files.create(
                        file=f,
                        purpose="user_data"
                    )
                if self.verbose:
                    print("Parsing data with OpenAI...")
instruction = f"""
From the given annual report, extract the balance sheet, income statement, and
cash flow statement data for the current year. The current year is {year}.
                Your response should simply be a JSON dictionary, ready to load with json.loads (no headers).
                The top-level dict keys must be balance_sheet, income_statement, and
                cash_flow_statement. The value for each of these keys is another dict whose
                items are the relevant data you find. Do not nest another dictionary beyond
                this level. Make sure there are no duplicate keys. I would like dict keys in
                lowercase_with_underscores.
I am expecting the following keys for the balance sheet: {expected_balance_sheet}
I am expecting the following keys for the income statement: {expected_income_statement}
I am expecting the following keys for the cash flow statement: {expected_cash_flow_statement}
                These are the minimum required sets; additional keys are expected. Include all
                the data you can, while maintaining accuracy in data categorization.
Special instructions: {special_instructions}
"""
response = self.client.responses.create(
model="o1",
input=[
{
"role": "user",
"content": [
{
"type": "input_file",
"file_id": openai_file.id,
},
{
"type": "input_text",
"text": instruction,
},
]
}
]
)
                output = json.loads(response.output_text)
                if self.verbose:
                    print("Result:")
                    print(output)
                final_dict = {'year': year}
                final_dict.update(output)
        if final_dict:
            if self.verbose:
                print("Saving result...")
            with open(os.path.join(report_path, "output.json"), 'w') as f:
                json.dump(final_dict, f)
        return final_dict
def parse_reduced_file(self, file, find_str, year, special_instructions, expected_columns):
        if self.verbose:
            print(f"Parsing {file} for {find_str}.")
            print(f"Uploading {find_str} to OpenAI...")
        with open(file, "rb") as f:
            openai_file = self.client.files.create(
                file=f,
                purpose="user_data"
            )
        if self.verbose:
            print("Parsing data with OpenAI...")
instruction = f"""
From the given annual report, extract the {find_str} data for the current
year. The current year is {year}.
        Your response should simply be a JSON dictionary, ready to load with json.loads (no headers).
        The dict items are the relevant data you find. Make sure there are no duplicate
        keys. Do not return a nested dictionary. I would like dict keys in
        lowercase_with_underscores.
I am expecting the following keys at minimum: {expected_columns}
Additional keys are expected. Include all data you can, while maintaining accuracy in data categorization.
Special instructions: {special_instructions}
"""
response = self.client.responses.create(
model="o1",
input=[
{
"role": "user",
"content": [
{
"type": "input_file",
"file_id": openai_file.id,
},
{
"type": "input_text",
"text": instruction,
},
]
}
]
)
try:
output = json.loads(response.output_text)
            if self.verbose:
                print("Result:")
                print(output)
return output
except Exception as e:
print(f"Failed to extract {find_str} from {file}. AI response:")
print(f"{response.output_text}")
print(e)
return {}
def save_reduced_pdf(self, input_pdf_path, output_pdf_path, page_numbers):
"""
Create a reduced PDF from the input_pdf_path containing only the pages listed in page_numbers.
Parameters:
input_pdf_path (str): Path to the original PDF.
output_pdf_path (str): Path where the reduced PDF will be saved.
page_numbers (list): A list of page numbers (0-indexed) to include in the new PDF.
"""
with open(input_pdf_path, "rb") as infile:
reader = PyPDF2.PdfReader(infile)
writer = PyPDF2.PdfWriter()
# Iterate through the list of page numbers
for page_num in page_numbers:
# Check if the page number is within the range of available pages
if page_num < len(reader.pages):
writer.add_page(reader.pages[page_num])
else:
print(f"Page {page_num + 1} is out of range. Skipping.")
# Save the reduced PDF
with open(output_pdf_path, "wb") as outfile:
writer.write(outfile)
print(f"Reduced PDF saved as: {output_pdf_path}")
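
    # Usage sketch with hypothetical paths (page numbers are 0-indexed):
    #   parser = DataParser(verbose=True)
    #   parser.save_reduced_pdf("report.pdf", "_balance_sheet_report.pdf", [41, 42, 43])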
def consolidate_reports(self, data, data_str, special_instructions, model):
instruction = f"""
Consolidate the given data.
Each item in the given list is a {data_str} for a given year.
The keys need to be consolidated across the years.
        You may leave some keys as None if there is no matching data for a given year.
        However, try your best to leave as few data gaps as possible. If there is a
        reported figure for one year, it is likely present in all years, or at least
        in subsequent years. For example, "assets" may change to "total_assets", but
        these should be merged. There should be no nesting of dictionaries; the data
        for a given year should be numbers.
Use your knowledge about the common items occurring in a {data_str}.
        Your output should be ready to load with json.loads (no headers).
Special instructions:
{special_instructions}
Data:
{data}
"""
        if self.verbose:
            print(f"Consolidating {data_str} data...")
response = self.client.responses.create(
model=model,
input=instruction
)
result = pd.DataFrame(json.loads(response.output_text)).set_index('year')
if self.verbose:
print(result)
return result
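

# A minimal end-to-end sketch of the DataParser flow; the directory, filename,
# and yearly data below are hypothetical placeholders, not values shipped with
# this module (running it requires a valid OpenAI key in secret.py).
if __name__ == "__main__":
    parser = DataParser(verbose=True)
    # Extract one year's statements; this also writes output.json into the
    # report directory.
    parser.parse_report_for_year("reports", "annual_report_2023.pdf", 2023)
    # Consolidate per-year dicts into a single DataFrame indexed by year.
    yearly_data = [
        {'year': 2023, 'total_assets': 100.0, 'total_liabilities': 60.0},
        {'year': 2022, 'assets': 90.0, 'total_liabilities': 55.0},
    ]
    consolidated = parser.consolidate_reports(
        yearly_data, "balance sheet", special_instructions="", model="o3-mini")
    print(consolidated)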