# zhihu_answer_wordcloud.py
import html2text
import requests
import re
import jieba
from wordcloud import WordCloud
import os
# sudo apt-get install python3-tk
'''
@author: github/wolverinn
@date: 01/20/2019
requirements:
Python 3.x
requests
html2text
jieba
wordcloud
chineseStopWords.txt
If you are not on Windows, change the font path passed to WordCloud in seg_answer.
function:
Input a Zhihu question URL, fetch up to 150 answers, save them as Markdown text, and generate a word cloud.
待添加功能:输入答案URL,保存答案到markdown
TODO: Input the URL of an answer, save it to markdown
'''
# HTTP request headers sent with every answers-API request made by get_flow().
# The User-Agent is a desktop Chrome string and Host is pinned to www.zhihu.com.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
"Host": "www.zhihu.com",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
}
# Question titles appended by get_flow(); question[0] is used as the base name
# of the generated word-cloud PNG.
question = []
def get_flow():
    """Download up to 150 answers of the current question into ``<question_id>.txt``.

    Reads the module-level ``question_id`` and ``headers``, requests the
    answers API in at most 15 pages of 10 answers each, converts every
    answer's HTML ``content`` with html2text and writes it to the output
    file. The question title found on each non-empty page is appended to the
    module-level ``question`` list.

    Returns:
        0 when paging stopped early because the last page of answers was
        reached (or a page came back empty); None when all 15 pages were
        requested.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    converter.ignore_emphasis = True
    converter.ignore_tables = True

    page_size = 10
    filename = "{}.txt".format(question_id)
    # ``with`` closes the file on the early returns below and on any exception;
    # the previous explicit close() was skipped by ``return 0``.
    with open(filename, 'w', encoding='utf-8') as f:
        for i in range(15):
            offset = i * page_size
            print("getting the {} answer...".format(offset))
            api_base_url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B%2A%5D.is_normal%2Ccontent%2Ceditable_content%2Cvoteup_count&limit={}&offset={}&platform=desktop&sort_by=default".format(question_id, page_size, offset)
            answer_flow = requests.get(api_base_url, headers=headers)
            if answer_flow.status_code != 200:
                # A failed page is skipped (as before), but no longer silently.
                print("request at offset {} returned HTTP {}, skipped".format(offset, answer_flow.status_code))
                continue

            # Parse the body once and reuse it for both the data and the paging info.
            payload = answer_flow.json()
            answer_data = payload["data"]
            if not answer_data:
                # Nothing on this page: indexing answer_data[0] would raise IndexError.
                return 0
            question.append(answer_data[0]["question"]["title"])

            for single_answer in answer_data:
                f.write(converter.handle(single_answer["content"]))
                f.write('\n')

            # Stop only AFTER writing this page, so the final (possibly partial)
            # page is kept; "<=" also avoids requesting an empty page when the
            # total is an exact multiple of the page size.
            if int(payload["paging"]["totals"]) <= offset + page_size:
                return 0
def seg_answer(font_path='C:/Windows/Fonts/simkai.ttf', stopwords_path="chineseStopWords.txt"):
    """Segment the saved answers with jieba and render them as a word-cloud PNG.

    Reads ``<question_id>.txt`` (module-level ``question_id``), drops stop
    words, and writes the image to ``<question[0]>.png``.

    Args:
        font_path: Font file handed to WordCloud. Defaults to the Windows
            KaiTi font the script originally hard-coded; pass another path on
            other platforms.
        stopwords_path: GBK-encoded stop-word file.
            NOTE(review): assumed to hold one stop word per line - confirm
            against the bundled chineseStopWords.txt.
    """
    with open(question_id + ".txt", 'r', encoding='utf-8') as answer_file:
        answer_content = answer_file.read()
    print("segging answers...")

    # Build a set of whole stop words. The previous code tested each token
    # against the raw file text, i.e. a substring match, which also removed
    # any token that merely occurs inside a longer stop word.
    with open(stopwords_path, 'r', encoding='GBK') as stopwords_file:
        stopwords = {line.strip() for line in stopwords_file}
    stopwords.discard("")

    new_answer_seg = [
        text
        for text in jieba.cut(answer_content)
        # Whitespace-only tokens carry no meaning for the cloud.
        if text.strip() and text not in stopwords
    ]
    string = ' '.join(new_answer_seg)

    print("making wordcloud...")
    wc = WordCloud(font_path=font_path, background_color='white', width=1000, height=800).generate(string)
    wc.to_file(question[0] + '.png')
# Script entry: ask for a question URL, download its answers, build the word
# cloud and try to open the resulting image.
question_url = input("输入知乎问题链接(如:https://www.zhihu.com/question/23819007):")
# Raw string avoids the invalid "\d" escape; "\d+" requires at least one digit,
# so a URL ending in "question/" is no longer accepted with an empty id.
valid_check = re.search(r'question/(\d+)', question_url)
if valid_check:
    question_id = valid_check.group(1)
    get_flow()
    if not question:
        # get_flow() recorded no title, so seg_answer() would fail on question[0].
        print("No answers could be downloaded for this question.")
    else:
        print("required answers saved successfully")
        seg_answer()
        img_name = question[0] + ".png"
        # The title comes from the remote API; never pass it through a shell.
        # os.startfile opens the file with its associated viewer and only
        # exists on Windows, which is the only platform the old command supported.
        if hasattr(os, "startfile"):
            os.startfile(img_name)
        else:
            print("wordcloud saved to {}".format(img_name))
    # print(question_id)
else:
    print("Not a valid zhihu question URL !")