-
Notifications
You must be signed in to change notification settings - Fork 1
/
stages.py
427 lines (372 loc) · 20.1 KB
/
stages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.docstore.document import Document
from langchain.chat_models import ChatOpenAI
from langchain.prompts import load_prompt, PipelinePromptTemplate, PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chains import MapReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.combine_documents.reduce import ReduceDocumentsChain
from langchain.callbacks import get_openai_callback
from openai.error import InvalidRequestError, RateLimitError
import tiktoken
# Summarize articles and get the result from OpenAI using Map-Reduce method
def summarize_article(apikey: str, content: str, ):
    """Summarize one article with an OpenAI chat model through LangChain.

    The token count of ``content`` (gpt-3.5-turbo encoding) selects the setup:

    * fewer than 1248 tokens: ``gpt-3.5-turbo`` with ``max_tokens=1248`` and a
      character splitter using 1248-token chunks;
    * otherwise: ``gpt-3.5-turbo-16k`` with ``max_tokens=3696`` and 11088-token
      chunks, using ``TokenTextSplitter`` once the text reaches 11088 tokens
      and ``CharacterTextSplitter`` below that.

    If splitting yields exactly one document it is passed straight to the
    reduce prompt; otherwise the documents go through the map-reduce chain.

    Args:
        apikey: OpenAI API key passed to ``ChatOpenAI``.
        content: Article text to summarize.

    Returns:
        A two-item list ``[summary, cb]``: the chain output and the object
        yielded by ``get_openai_callback()``.

    Raises:
        InvalidRequestError: Raised again with the model name and token count
            in the message when the chain raises ``InvalidRequestError``; the
            original error is attached as ``__cause__``.
    """
    encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
    token_count = len(encoding.encode(content))

    if token_count < 1248:
        model_name, max_tokens = 'gpt-3.5-turbo', 1248
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=1248, chunk_overlap=0, model_name='gpt-3.5-turbo'
        )
    else:
        model_name, max_tokens = 'gpt-3.5-turbo-16k', 3696
        # Texts at or above one full chunk are split on token boundaries;
        # shorter ones use the character splitter.
        splitter_cls = TokenTextSplitter if token_count >= 11088 else CharacterTextSplitter
        text_splitter = splitter_cls.from_tiktoken_encoder(
            chunk_size=11088, chunk_overlap=0, model_name='gpt-3.5-turbo-16k',
        )

    # Wrap the text in a Document so the splitter can produce Document chunks.
    split_docs = text_splitter.split_documents([Document(page_content=content)])
    llm = ChatOpenAI(temperature=0, openai_api_key=apikey, model=model_name, max_tokens=max_tokens)

    # Map step: applied to every chunk.
    map_prompt = load_prompt("./prompts/summarize-map.yaml")
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce step: applied to the combined chunk summaries.
    reduce_prompt = load_prompt("./prompts/summarize-reduce.yaml")
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

    # Joins a list of documents into one string and feeds it to reduce_chain.
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries",
    )

    # The same chain serves as the final combine step and as the collapse step.
    reduce_documents_chain = ReduceDocumentsChain(
        combine_documents_chain=combine_documents_chain,
        collapse_documents_chain=combine_documents_chain,
        token_max=13333,
    )

    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        # Variable name in the map prompt that receives each document.
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    try:
        with get_openai_callback() as cb:
            # A single chunk skips the map step and goes directly to the reduce prompt.
            if len(split_docs) == 1:
                summary = combine_documents_chain.run(split_docs)
            else:
                summary = map_reduce_chain.run(split_docs)
            return [summary, cb]
    except InvalidRequestError as er:
        # Chain explicitly so the original OpenAI error stays attached as the cause.
        raise InvalidRequestError(
            f"model: {llm.model_name}\ntoken_count: {token_count}", er.param
        ) from er
def categorize(apikey: str, primaries: list[str], secondaries: list[str]):
    """Ask the model to group secondary titles under primary titles.

    Primary titles are rendered as a 1-based numbered list and secondary
    titles as a dash-prefixed list; both are passed to the prompt loaded from
    ``./prompts/categorize.yaml`` together with a fixed example string.

    Args:
        apikey: OpenAI API key passed to ``ChatOpenAI``.
        primaries: Titles rendered as ``"<n> <title>"`` lines.
        secondaries: Titles rendered as ``"- <title>"`` lines.

    Returns:
        A two-item list ``[result, cb]``. ``result`` is the model output cut at
        the first ``]}]`` with ``]}]`` appended again; ``cb`` is the object
        yielded by ``get_openai_callback()``.
    """
    example = """[{"Primary": "Trump Indicted for Espionage", "Secondary": ["Trump Indicted for Espionage", "Trump faces criminal charges", "Trump Arrested on Classified Documents Charges"], "Title": [Trump under investigation]}, ...]"""

    primary = "".join(
        f"{number} {title}\n" for number, title in enumerate(primaries, start=1)
    )
    secondary = "".join(f"- {title}\n" for title in secondaries)

    prompt = load_prompt("./prompts/categorize.yaml")
    llm = ChatOpenAI(temperature=0, openai_api_key=apikey, model='gpt-3.5-turbo-16k')
    chain = LLMChain(llm=llm, prompt=prompt)

    # The token count is only printed for inspection; it does not alter the request.
    tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
    token_count = len(tokenizer.encode(primary + secondary))
    print(primary)
    print(secondary)
    print(token_count)

    with get_openai_callback() as cb:
        raw: str = chain.run(primary_titles=primary, secondary_titles=secondary, example=example)
        # Keep everything before the first closing marker and re-append the marker.
        head, _, _ = raw.partition(']}]')
        result = head + ']}]'
        return [result, cb]
def extra_research(apikey: str, articles: list[str]):
    """Run the extra-research prompt over a set of articles.

    The articles are joined with newlines, split into token-sized chunks and
    sent to ``gpt-3.5-turbo-16k``. The prompt is the rules loaded from
    ``./prompts/extra-research.yaml`` followed by fixed output-format
    instructions. The prompt's own token count is subtracted from 16000;
    75% of the remainder is the chunk size and 25% is ``max_tokens``.

    Args:
        apikey: OpenAI API key passed to ``ChatOpenAI``.
        articles: Article texts to analyze.

    Returns:
        A two-item list ``[summary, cb]``: the chain output and the object
        yielded by ``get_openai_callback()``.
    """
    joined_articles = "\n".join(articles)
    rules_prompt = load_prompt("./prompts/extra-research.yaml")
    template_text = """{{rules}}
Please adhere to the following structure for your examination
Output must be in this format. This must be valid python dictinoary or json object
Don't include " in the middle of result sentences in the output
Don't include ' in the middle of result sentences in the output
Don't include line breaking character (new line character) in the middle of result sentences in the output
###Output Format###
{"Introduction": "Summarize the articles' topic and content briefly. Highlight the key issues or events to be analyzed.", "Historical Context": "Discuss the events' history. Identify crucial factors, background details, and significant precedents relevant to the situation.", "Key Players": "Identify the primary individuals or organizations involved in the events. Explore their roles and their effects on the events.", "Underlying Motivations": "Examine the driving forces behind the actions in the articles. Dig into the goals, interests, or ideologies of the engaged parties, supporting your analysis with solid evidence or credible theories.", "Recent Developments": "Address any fresh updates or happenings related to the events.", "Impact": "Evaluate the immediate and long-term consequences of the events.", "Future Challenges": "Identify potential challenges that may arise from these events. Discuss potential implications and strategies to tackle them.", "Historical Comparisons": "Provide three instances from recent history that resonate with the events in the articles. Analyze each example, emphasizing similarities, differences, and lessons gleaned.", "Conclusion": "Wrap up your analysis by encapsulating the key findings or insights. Propose areas for further research, if applicable."}
"""
    outer_prompt = PromptTemplate.from_template(template_text, template_format="jinja2")
    prompt = PipelinePromptTemplate(
        final_prompt=outer_prompt,
        pipeline_prompts=[("rules", rules_prompt)],
    )

    # Measure the prompt with no articles to find the remaining token budget.
    tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
    prompt_tokens = len(tokenizer.encode(prompt.format(articles='')))
    chunk_size = int((16000 - prompt_tokens) * 0.75)
    max_token = int((16000 - prompt_tokens) * 0.25)

    splitter = TokenTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=0, model_name='gpt-3.5-turbo-16k',
    )
    split_docs = splitter.split_documents([Document(page_content=joined_articles)])

    llm = ChatOpenAI(temperature=0, openai_api_key=apikey, model='gpt-3.5-turbo-16k', max_tokens=max_token)

    # Map and reduce steps use the same prompt.
    map_chain = LLMChain(llm=llm, prompt=prompt)
    reduce_chain = LLMChain(llm=llm, prompt=prompt)

    # Joins documents into one string and feeds it to reduce_chain as "articles".
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="articles",
    )
    # The same chain serves as the final combine step and as the collapse step.
    reduce_documents_chain = ReduceDocumentsChain(
        combine_documents_chain=combine_documents_chain,
        collapse_documents_chain=combine_documents_chain,
        token_max=13333,
    )
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        document_variable_name="articles",
        return_intermediate_steps=False,
    )

    with get_openai_callback() as cb:
        # A single chunk is sent directly; several chunks go through map-reduce.
        runner = combine_documents_chain if len(split_docs) == 1 else map_reduce_chain
        summary = runner.run(split_docs)
        return [summary, cb]
def deep_research(apikey: str, articles: list[dict[str, str]], background: dict):
    """Run the deep-research prompt over titled articles plus background notes.

    Each article is rendered as ``Title: ...`` / ``Content: ...`` and the
    results are joined with newlines. The prompt is the rules loaded from
    ``./prompts/deep-research.yaml``, fixed output-format instructions, and the
    rendered ``background`` entries. The prompt's own token count is subtracted
    from 16000; 75% of the remainder is the chunk size and 25% is
    ``max_tokens`` for ``gpt-3.5-turbo-16k``.

    Args:
        apikey: OpenAI API key passed to ``ChatOpenAI``.
        articles: Dicts read via their ``'title'`` and ``'content'`` keys.
        background: Mapping whose items are rendered as ``"<key>: <value>"``.

    Returns:
        A two-item list ``[summary, cb]``: the chain output and the object
        yielded by ``get_openai_callback()``.
    """
    content = "\n".join(
        f"Title: {article['title']}\nContent: {article['content']}" for article in articles
    )
    # Kept in a separate local so the ``background`` parameter is not rebound to a str.
    background_text = "\n".join(f"{p}: {v}\n" for p, v in background.items())
    original_prompt = load_prompt("./prompts/deep-research.yaml")
    full_template="""{{rules}}
Output must follow this format
Output must be in this format.
Don't include " in the middle of result sentences in output
Output format must follow this format
Output must be json object
This must be valid python dictinoary or json object
###Output Format to follow###
{"1 day timeframe": {"Most likely": {"Description": "Most likely", "Explanation": "Explanation"}, "Possible": {"Description": "Possible", "Explanation": "Explanation"}, "Unlikely": {"Description": "Unlikely", "Explanation": "Explanation"}}, "1 week timeframe": {"Most likely": {"Description": "Most likely", "Explanation": "Explanation"}, "Possible": {"Description": "Possible", "Explanation": "Explanation"}, "Unlikely": {"Description": "Unlikely", "Explanation": "Explanation"}}, "1 month timeframe": {"Most likely": {"Description": "Most likely", "Explanation": "Explanation"}, "Possible": {"Description": "Possible", "Explanation": "Explanation"}, "Unlikely": {"Description": "Unlikely", "Explanation": "Explanation"}}}
###additional background, context, and examples###
{{background}}
"""
    final_prompt = PromptTemplate.from_template(full_template, template_format="jinja2")
    # NOTE(review): the background text is parsed as a Jinja2 template, so any
    # "{{" or "{%" inside it is treated as template syntax — confirm callers
    # only pass text that is safe to interpret this way.
    background_prompt = PromptTemplate.from_template(background_text, template_format="jinja2")
    prompt = PipelinePromptTemplate(
        final_prompt=final_prompt,
        pipeline_prompts=[
            ("rules", original_prompt),
            ("background", background_prompt),
        ],
    )

    # Measure the prompt with no articles to find the remaining token budget.
    encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
    token_count = len(encoding.encode(prompt.format(articles='')))
    chunk_size = int((16000 - token_count) * 0.75)
    max_token = int((16000 - token_count) * 0.25)

    text_splitter = TokenTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=0, model_name='gpt-3.5-turbo-16k',
    )
    split_docs = text_splitter.split_documents([Document(page_content=content)])

    llm = ChatOpenAI(temperature=0, openai_api_key=apikey, model='gpt-3.5-turbo-16k', max_tokens=max_token)

    # Map and reduce steps use the same prompt.
    map_chain = LLMChain(llm=llm, prompt=prompt)
    reduce_chain = LLMChain(llm=llm, prompt=prompt)

    # Joins documents into one string and feeds it to reduce_chain as "articles".
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="articles",
    )
    # The same chain serves as the final combine step and as the collapse step.
    reduce_documents_chain = ReduceDocumentsChain(
        combine_documents_chain=combine_documents_chain,
        collapse_documents_chain=combine_documents_chain,
        token_max=13333,
    )
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        document_variable_name="articles",
        return_intermediate_steps=False,
    )

    with get_openai_callback() as cb:
        # A single chunk is sent directly; several chunks go through map-reduce.
        if len(split_docs) == 1:
            summary = combine_documents_chain.run(split_docs)
        else:
            summary = map_reduce_chain.run(split_docs)
        return [summary, cb]
def impactul_news(apikey: str, articles: list[dict]):
    """Run the impactful-news prompt over article titles and summaries.

    Each article is rendered as ``Title: ...`` / ``Summary: ...`` and the
    results are joined with newlines. The prompt is the rules loaded from
    ``./prompts/impactful-news.yaml`` followed by fixed output-format
    instructions. The prompt's own token count is subtracted from 16000;
    75% of the remainder is the chunk size and 25% is ``max_tokens`` for
    ``gpt-3.5-turbo-16k``.

    Args:
        apikey: OpenAI API key passed to ``ChatOpenAI``.
        articles: Dicts read via their ``'title'`` and ``'summary'`` keys.

    Returns:
        A two-item list ``[summary, cb]``: the chain output and the object
        yielded by ``get_openai_callback()``.
    """
    rendered = [
        f"Title: {article['title']}\nSummary: {article['summary']}" for article in articles
    ]
    joined_articles = "\n".join(rendered)
    rules_prompt = load_prompt("./prompts/impactful-news.yaml")
    template_text = """{{rules}}
Output must follow this format
Output must be in this format. This must be python dictinoary or json object
Don't include " in the middle of result sentences
###Output Format###
[{"title": title, "explanation": Explanation}, {"title": title, "explanation": Explanation}, ... {"title": title, "explanation": Explanation}]
"""
    outer_prompt = PromptTemplate.from_template(template_text, template_format="jinja2")
    prompt = PipelinePromptTemplate(
        final_prompt=outer_prompt,
        pipeline_prompts=[("rules", rules_prompt)],
    )

    # Measure the prompt with no articles to find the remaining token budget.
    tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
    prompt_tokens = len(tokenizer.encode(prompt.format(articles='')))
    chunk_size = int((16000 - prompt_tokens) * 0.75)
    max_token = int((16000 - prompt_tokens) * 0.25)

    splitter = TokenTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=0, model_name='gpt-3.5-turbo-16k',
    )
    split_docs = splitter.split_documents([Document(page_content=joined_articles)])

    llm = ChatOpenAI(temperature=0, openai_api_key=apikey, model='gpt-3.5-turbo-16k', max_tokens=max_token)

    # Map and reduce steps use the same prompt.
    map_chain = LLMChain(llm=llm, prompt=prompt)
    reduce_chain = LLMChain(llm=llm, prompt=prompt)

    # Joins documents into one string and feeds it to reduce_chain as "articles".
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="articles",
    )
    # The same chain serves as the final combine step and as the collapse step.
    reduce_documents_chain = ReduceDocumentsChain(
        combine_documents_chain=combine_documents_chain,
        collapse_documents_chain=combine_documents_chain,
        token_max=13333,
    )
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        document_variable_name="articles",
        return_intermediate_steps=False,
    )

    with get_openai_callback() as cb:
        # A single chunk is sent directly; several chunks go through map-reduce.
        runner = combine_documents_chain if len(split_docs) == 1 else map_reduce_chain
        summary = runner.run(split_docs)
        return [summary, cb]
def prediction(apikey: str, topics: list[dict], category: str, timeframe: str):
    """Ask the model for the next developing trend in a news category.

    Each topic is rendered as ``topic: ...`` / ``prediction: ...`` and the
    results are joined with newlines. The prompt combines a fixed task
    description, the category, a time-frame phrase and the text loaded from
    ``./prompts/prediction.yaml``. The prompt's own token count is subtracted
    from 16000; 75% of the remainder is the chunk size and 25% is
    ``max_tokens`` for ``gpt-3.5-turbo-16k``.

    Args:
        apikey: OpenAI API key passed to ``ChatOpenAI``.
        topics: Dicts read via their ``'topic'`` and ``'prediction'`` keys.
        category: News category inserted into the task description.
        timeframe: ``'week'`` or ``'month'``; any other value is treated as a
            1-day time frame.

    Returns:
        A two-item list ``[summary, cb]``: the chain output and the object
        yielded by ``get_openai_callback()``.
    """
    content = "\n".join(
        f"topic: {topic['topic']}\nprediction: {topic['prediction']}" for topic in topics
    )
    main_prompt = load_prompt("./prompts/prediction.yaml")

    # Translate the short timeframe code into the phrase used in the task text.
    timeframe_phrase = {
        'week': 'on a 1-week time frame',
        'month': 'on a 1-month time frame',
    }.get(timeframe, "on a 1-day time frame")

    # NOTE(review): category is parsed as a Jinja2 template, so "{{" or "{%"
    # inside it is treated as template syntax — confirm callers pass plain text.
    category_prompt = PromptTemplate.from_template(category, template_format="jinja2")
    # Build the pipeline prompt from the phrase, not the raw code: the pipeline
    # output is what fills {{timeframe}} when the chain runs, so using the raw
    # argument would put the bare word "week"/"month" into the task sentence.
    timeframe_prompt = PromptTemplate.from_template(timeframe_phrase, template_format="jinja2")

    # NOTE(review): the output-format line below ends with an unbalanced "}}";
    # left unchanged here — confirm whether that is intended.
    full_template="""
Imagine you are a professional news analyst and journalist
###Task###
Let's think step by step.
Describe what you believe to be the next biggest development or emerging trend in the {{category}} category of the news based on the predictions and summaries to the most relevant news topics {{timeframe}}.
{{main}}
#######
Output must follow this format
Output must be in this format. This must be valid python dictinoary or json object
Don't include " in the middle of result sentences
The output for Explanation should explain how the developing trend came to be as well as describe in detail the potential connections, ripple effects, etc of the developing trend
###Output Format###
{"Developing Trend 1": Developing_Trend_1, "Explanation": Explanation, "Opportunities that may arise": Opportunities_that_may_arise, "Potential Pitfalls": Potential_Pitfalls}}
"""
    final_prompt = PromptTemplate.from_template(full_template, template_format="jinja2")
    prompt = PipelinePromptTemplate(
        final_prompt=final_prompt,
        pipeline_prompts=[
            ("main", main_prompt),
            ("category", category_prompt),
            ("timeframe", timeframe_prompt),
        ],
    )

    # Measure the prompt with no topics to find the remaining token budget.
    encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
    token_count = len(
        encoding.encode(prompt.format(topics='', category=category, timeframe=timeframe_phrase))
    )
    chunk_size = int((16000 - token_count) * 0.75)
    max_token = int((16000 - token_count) * 0.25)

    text_splitter = TokenTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=0, model_name='gpt-3.5-turbo-16k',
    )
    split_docs = text_splitter.split_documents([Document(page_content=content)])

    llm = ChatOpenAI(temperature=0, openai_api_key=apikey, model='gpt-3.5-turbo-16k', max_tokens=max_token)

    # Map and reduce steps use the same prompt.
    map_chain = LLMChain(llm=llm, prompt=prompt)
    reduce_chain = LLMChain(llm=llm, prompt=prompt)

    # Joins documents into one string and feeds it to reduce_chain as "topics".
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="topics",
    )
    # The same chain serves as the final combine step and as the collapse step.
    reduce_documents_chain = ReduceDocumentsChain(
        combine_documents_chain=combine_documents_chain,
        collapse_documents_chain=combine_documents_chain,
        token_max=13333,
    )
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        document_variable_name="topics",
        return_intermediate_steps=False,
    )

    with get_openai_callback() as cb:
        # A single chunk is sent directly; several chunks go through map-reduce.
        if len(split_docs) == 1:
            summary = combine_documents_chain.run(split_docs)
        else:
            summary = map_reduce_chain.run(split_docs)
        return [summary, cb]