Skip to content

Commit

Permalink
Support parse jobs posts
Browse files Browse the repository at this point in the history
  • Loading branch information
polyrabbit committed May 14, 2024
1 parent 35050b3 commit f4357ed
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 32 deletions.
55 changes: 24 additions & 31 deletions hacker_news/llm/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,45 +79,38 @@ def call_openai_family(content: str, sys_prompt: str) -> str:
# Gemma outputs weird words like Kün/viciss/▁purcha/▁xPos/▁Gorb
kwargs['logit_bias'] = {200507: -100, 225856: -100, 6204: -100, 232014: -100, 172406: -100}

if config.openai_model.startswith('text-'):
prompt = (f'Use third person mood to summarize the following article delimited by triple backticks in 2 concise English sentences. Ensure the summary does not exceed 100 characters.\n'
f'```{content.strip(".")}.```')
resp = openai.Completion.create(
prompt=prompt,
**kwargs
)
answer = resp['choices'][0]['text'].strip()
else:
resp = openai.ChatCompletion.create(
messages=[
{
"role": "system",
"content": sys_prompt
},
{'role': 'user', 'content': content},
],
**kwargs)
message = resp["choices"][0]["message"]
if message.get('function_call'):
json_str = message['function_call']['arguments']
if resp["choices"][0]['finish_reason'] == 'length':
json_str += '"}' # best effort to save truncated answers
try:
answer = json.loads(json_str)
except JSONDecodeError as e:
logger.warning(f'Failed to decode answer from openai, will fallback to plain text, error: {e}')
return '' # Let fallback code kicks in
else:
answer = message['content'].strip()
resp = openai.ChatCompletion.create(
messages=[
{
"role": "system",
"content": sys_prompt
},
{'role': 'user', 'content': content},
],
**kwargs)
logger.info(f'content: {content}')
logger.info(f'took {time.time() - start_time}s to generate: '
# Default str(resp) prints \u516c
f'{json.dumps(resp.to_dict_recursive(), sort_keys=True, indent=2, ensure_ascii=False)}')
if 'error' in resp:
raise Exception(f'error message: {resp["error"].get("message")}, code: {resp["error"].get("code")}')
message = resp["choices"][0]["message"]
if message.get('function_call'):
json_str = message['function_call']['arguments']
if resp["choices"][0]['finish_reason'] == 'length':
json_str += '"}' # best effort to save truncated answers
try:
answer = json.loads(json_str)
except JSONDecodeError as e:
logger.warning(f'Failed to decode answer from openai, will fallback to plain text, error: {e}')
return '' # Let fallback code kicks in
else:
answer = message['content'].strip()
# Gemma sometimes returns "**Summary:**\n\nXXX\n\n**Key points:**\n\nXXX", extract the summary part
for line in answer.split('\n'):
if not line.strip():
continue
if 'summary' in line.lower() and len(line) <= 100:
if 'summary' in line.lower() and line.strip()[-1] == ':':
continue
answer = line
break
Expand Down
6 changes: 5 additions & 1 deletion hacker_news/news.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ def get_score(self) -> int:
except:
return 0

def is_hiring_job(self) -> bool:
    """Tell whether this item looks like a YC company's hiring post.

    The heuristic requires all three of the following: ``get_score()``
    returns 0, ``author`` is falsy, and the title contains ``'YC '``.

    Returns:
        True when all three conditions hold, False otherwise. A missing
        (``None``) title is treated as an empty one instead of raising
        ``TypeError``.
    """
    if self.get_score() != 0 or self.author:
        return False
    # The title may be unset (slug() falls back to 'no title' for that case);
    # `'YC ' in None` would raise TypeError, so substitute an empty string.
    return 'YC ' in (self.title or '')

def slug(self):
    """Return the slugified title, using 'no title' when the title is falsy."""
    title_text = self.title if self.title else 'no title'
    return slugify(title_text)

Expand Down Expand Up @@ -130,7 +133,8 @@ def summarize_by_openai(self, content):
if not openai.api_key:
logger.info("OpenAI API key is not set")
return ''
if self.get_score() < config.openai_score_threshold: # Avoid expensive openai
if (self.get_score() < config.openai_score_threshold # Avoid expensive openai
and not self.is_hiring_job()):
logger.info("Score %d is too small, ignore openai", self.get_score())
return ''

Expand Down
5 changes: 5 additions & 0 deletions test/test_hackernews_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from datetime import datetime, timedelta

from hacker_news.algolia_api import get_news
from hacker_news.news import News
from hacker_news.parser import HackerNewsParser


Expand Down Expand Up @@ -52,3 +53,7 @@ def test_algolia_api(self):
date = news_list[0].submit_time.date()
for news in news_list:
self.assertEqual(date, news.submit_time.date())

def test_maybe_jobs_post(self):
    """A News built from only a '(YC S11) Is Hiring' title is detected as a hiring post."""
    hiring_title = 'MixRank (YC S11) Is Hiring Software Engineers and Founders Globally'
    hiring_post = News(title=hiring_title)
    self.assertTrue(hiring_post.is_hiring_job())

0 comments on commit f4357ed

Please sign in to comment.