/
get.py
34 lines (29 loc) · 1.08 KB
/
get.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def extract_Text_pdf(book, passcode=None):
    """Extract the text of every page of a PDF file.

    Args:
        book: Path of the PDF file; it is opened in binary mode.
        passcode: Password used when the PDF reports itself as encrypted.
            It is converted with ``str()`` before being handed to the
            reader. When ``None``, the password is read interactively
            from standard input with the prompt ``Password>``.

    Returns:
        A single space followed by the concatenated text of all pages,
        or ``None`` when an error occurred (the error message is printed
        instead of being raised).
    """
    import PyPDF2

    try:
        with open(book, "rb") as pdf_file:
            # NOTE(review): PdfFileReader / isEncrypted / numPages /
            # getPage / extractText appear to be the legacy PyPDF2 naming;
            # confirm the installed PyPDF2 version before renaming them.
            reader = PyPDF2.PdfFileReader(pdf_file)
            if reader.isEncrypted:
                try:
                    password = (
                        input("Password>") if passcode is None else passcode
                    )
                    reader.decrypt(str(password))
                except PyPDF2.utils.PdfReadError as e:
                    # Best effort: report the problem and still try to
                    # read the pages below.
                    print(str(e))
            # Pages must be read while the file is still open.
            page_texts = [
                reader.getPage(page_num).extractText()
                for page_num in range(reader.numPages)
            ]
            # Join once instead of growing a string page by page. The
            # leading space is kept for backward compatibility with
            # existing callers.
            return " " + "".join(page_texts)
    except Exception as err:
        # Top-level boundary of this helper: failures are reported on
        # stdout and signalled to the caller by returning None.
        print(str(err))
        return None
def extract_Text_docx(book):
    """Return the text of a Word document, one paragraph per line.

    Args:
        book: Passed unchanged to ``docx.Document`` to open the document.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    import docx as d

    document = d.Document(book)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)