/
webGenAPI.py
30 lines (24 loc) · 1.02 KB
/
webGenAPI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
import urllib
import videoGenerationAPI
import html2text
import requests
import nltk
from bs4 import BeautifulSoup
def generateVideoFromURL(url, outpufilename, language, sex):
    """Download a web page, reduce it to plain text and hand it to the video generator.

    The page at ``url`` is fetched and parsed with BeautifulSoup, every
    ``<script>`` and ``<style>`` element is removed, and the remaining text
    is split into whitespace-trimmed chunks. Only the first 40 chunks are
    considered; the non-empty ones among them are joined with newlines and
    passed to ``videoGenerationAPI.generateSentenceVideo``.

    Args:
        url: Address of the page to read.
        outpufilename: Forwarded as the second argument of
            ``generateSentenceVideo``. It used to be ignored in favour of
            the hard-coded string "output".
        language: Forwarded unchanged to ``generateSentenceVideo``.
        sex: Forwarded unchanged to ``generateSentenceVideo``.

    Returns:
        None.
    """
    # Function-scope imports keep the block self-contained and let it run on
    # both Python 2 (urllib.urlopen) and Python 3 (urllib.request.urlopen).
    from contextlib import closing
    from itertools import islice

    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # Number of leading chunks (blank ones included) that are inspected.
    max_chunks = 40

    # closing() guarantees the connection is released even if read() raises.
    with closing(urlopen(url)) as response:
        html = response.read()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, "html.parser")

    # Remove script and style elements so their contents do not end up in
    # the extracted text.
    for element in soup(["script", "style"]):
        element.extract()

    text = soup.get_text()

    # Strip leading and trailing whitespace from every line.
    lines = (line.strip() for line in text.splitlines())

    # Split each line into chunks.
    # NOTE(review): the separator here is a single space, which yields one
    # chunk per word; the common form of this recipe splits on two spaces to
    # separate headlines. Confirm which one is intended.
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))

    # islice stops after the first `max_chunks` chunks instead of walking the
    # whole document; blank chunks still count toward the limit but are
    # dropped from the output.
    text = "\n".join(chunk for chunk in islice(chunks, max_chunks) if chunk)

    videoGenerationAPI.generateSentenceVideo(text, outpufilename, language, sex)
if __name__ == "__main__":
    # Script entry point: run the generator once against a fixed Wikipedia
    # page, passing "1" for both the language and the sex arguments.
    generateVideoFromURL(
        url="https://en.wikipedia.org/wiki/User_talk:169.232.218.22",
        outpufilename="output",
        language="1",
        sex="1",
    )