# translations.py
from pdf2image import convert_from_path
from os import path, mkdir
import pickle
import io
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from apiclient.http import MediaFileUpload, MediaIoBaseDownload
# OAuth scopes requested for the Drive API. If modifying these scopes, delete
# the file token.pickle so that cached credentials are not reused.
SCOPES = ['https://www.googleapis.com/auth/drive.file']
def auth_drive():
    """Authenticate against Google Drive and return a usable service object.

    Cached credentials are read from ``token.pickle`` when that file exists.
    When they are missing or not valid, they are either refreshed (expired
    credentials that carry a refresh token) or obtained again through the
    installed-app flow configured by ``credentials.json``. In both cases the
    resulting credentials are written back to ``token.pickle``.

    Returns:
        The object produced by ``build('drive', 'v3', credentials=...)``.
    """
    token_path = 'token.pickle'
    credentials = None

    # token.pickle stores the user's access and refresh tokens; it is created
    # below the first time the authorization flow completes.
    if path.exists(token_path):
        with open(token_path, 'rb') as cached:
            credentials = pickle.load(cached)

    usable = credentials and credentials.valid
    if not usable:
        refreshable = (
            credentials and credentials.expired and credentials.refresh_token
        )
        if refreshable:
            credentials.refresh(Request())
        else:
            # No usable credentials at all: let the user log in.
            credentials = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES).run_local_server(port=0)
        # Save the credentials for the next run.
        with open(token_path, 'wb') as cached:
            pickle.dump(credentials, cached)

    return build('drive', 'v3', credentials=credentials)
def split_pdf(file):
    """Render the pages of a PDF as JPEG files in ``orig_images``.

    The ``orig_images`` directory is created when it does not exist yet.
    Conversion is done at 300 dpi and starts at page 2 of the document.

    Args:
        file: Path of the PDF to convert.

    Returns:
        Whatever ``convert_from_path`` returns when called with
        ``paths_only=True`` (presumably the paths of the written images).
    """
    image_dir = "orig_images"
    dir_present = path.exists(image_dir)
    if not dir_present:
        mkdir(image_dir)
    return convert_from_path(
        file,
        dpi=300,
        output_file="",
        output_folder=image_dir,
        first_page=2,
        fmt='jpeg',
        paths_only=True,
    )
def upload_doc(service, file):
    """Upload one JPEG page image to Google Drive.

    Args:
        service: Drive service object, as returned by ``auth_drive``.
        file: Path of the JPEG image to upload.

    Returns:
        The response of the ``files().create`` request. Only the ``id``
        field is requested, so the response carries the new file's id.
    """
    # Name the Drive file after the last path component. basename copes with
    # bare file names, nested directories and the platform's path separator,
    # whereas file.split("/")[1] picked the *second* component and raised
    # IndexError when the path contained no "/".
    name = path.basename(file)
    file_metadata = {
        'name': name,
        'mimeType': 'application/vnd.google-apps.file',
        'parents': ['1my_JqnNBNak5-aIlt5TZ92IHOazWUU2G'],
    }
    media = MediaFileUpload(file,
                            mimetype='image/jpeg',
                            resumable=True)
    file_up = service.files().create(body=file_metadata,
                                     media_body=media,
                                     fields='id').execute()
    return file_up
def download_doc(service, file_up, file_name):
    """Export an uploaded Drive file as plain text and append it to a file.

    The remote file is exported with MIME type ``text/plain``, the bytes are
    appended to ``file_name``, download progress is printed per chunk, and
    finally the remote file is deleted from Drive.

    Args:
        service: Drive service object, as returned by ``auth_drive``.
        file_up: Mapping with an ``'id'`` key identifying the Drive file.
        file_name: Path of the local file the text is appended to.
    """
    out = "down_text"
    if not path.exists(out):
        mkdir(out)
    # NOTE(review): the directory above is created, but the text is written
    # to file_name exactly as given, not inside "down_text" - confirm intent.
    request = service.files().export_media(fileId=file_up['id'],
                                           mimeType='text/plain')
    # Open in append mode so consecutive pages accumulate in one file, and
    # use a context manager so the handle is closed even if a chunk fails
    # (previously the handle was never closed).
    with io.FileIO(file_name, "ab") as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print("Download %d%%." % int(status.progress() * 100))
    service.files().delete(fileId=file_up['id']).execute()
def main():
    """Run the pipeline for one PDF: split it, then upload and export each page.

    Every page image produced by ``split_pdf`` is uploaded with
    ``upload_doc`` and its plain-text export is appended by ``download_doc``
    to a ``.txt`` file named after the PDF.
    """
    file_name = "Schams - 1930 - Komparative Statik.pdf"
    # splitext removes only the final extension, so a dot elsewhere in the
    # name no longer truncates the output name (split(".")[0] cut at the
    # first dot).
    final_file = path.splitext(file_name)[0] + ".txt"
    pages = split_pdf(file_name)
    service = auth_drive()
    for page in pages:
        file_up = upload_doc(service, page)
        download_doc(service, file_up, final_file)
    # TODO: optionally write a "[<page number>]" marker line between pages
    # (an earlier draft appended one to down_text/ocr_text.txt).


if __name__ == '__main__':
    main()
class Phrase():
    """A phrase of text, linked to its neighbours and to its footnotes."""

    def __init__(self, phrase):
        """Store the phrase text and initialise every other field as empty.

        Args:
            phrase: The phrase text.
        """
        self.phrase = phrase
        self.trad = ""
        self.prev = None
        self.next = None
        self.page = None
        # A fresh list per instance, so phrases never share footnotes.
        self.footnote = []


# Footnote must be defined after Phrase: the base class is looked up when the
# class statement executes, so the reverse order raises NameError on import.
class Footnote(Phrase):
    """A Phrase that additionally carries a ``pos`` attribute."""

    def __init__(self, phrase=""):
        """Initialise the Phrase fields and set ``pos`` to None.

        Args:
            phrase: The footnote text; defaults to an empty string so that
                ``Footnote()`` keeps working without arguments.
        """
        # Run the base initializer so a Footnote has every Phrase attribute.
        super().__init__(phrase)
        self.pos = None