-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc2text.py
127 lines (107 loc) · 2.99 KB
/
doc2text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import getopt
import locale
import shutil
import sys
from itertools import cycle
from subprocess import Popen, PIPE
import docx
from docx.table import Table
from docx.text.paragraph import Paragraph
class Doc2TextError(Exception):
    """Error raised when a document cannot be converted to text."""
def consist_table(file):
    """Report whether the given text appears to contain a table.

    Args:
        file: The text to inspect (a string of document text, despite the
            parameter name).

    Returns:
        bool: True if ``file`` contains at least one ``|`` character,
        otherwise False.
    """
    # The membership test already yields a bool; no if/else is needed.
    return '|' in file
def smart_strip(cells):
    """Normalise whitespace in every cell and join the cells with ``;``.

    Each cell loses its leading and trailing whitespace and has every
    internal run of whitespace reduced to a single space.  The list that
    is passed in is overwritten in place with the cleaned values.

    Args:
        cells: Mutable list of cell strings; modified in place.

    Returns:
        str: The cleaned cells joined by ``;``.
    """
    cleaned = []
    for raw_cell in cells:
        words = raw_cell.split()
        cleaned.append(' '.join(words))
    # Slice assignment keeps the caller's list object but replaces its items.
    cells[:] = cleaned
    return ';'.join(cleaned)
def parse_word_tables(file):
    """Convert text containing ``|``-delimited table rows into a list of lines.

    Lines that neither start nor end with ``|`` are copied through
    unchanged.  Table lines are split on ``|``: a line in which no field
    is whitespace-only starts a new row, while any other table line is
    treated as a continuation whose fields are appended to the fields of
    the row currently being built.  Each finished row is emitted exactly
    once, as its whitespace-normalised fields joined by ``;`` (via
    ``smart_strip``).  Because rows are split on ``|``, a leading or
    trailing border character yields an empty first or last field.

    Restriction: every cell of a row must be filled on the row's first
    line; a first line with a blank cell that follows another row is
    taken for a continuation of that row.

    Args:
        file: The text to parse (a string, not a file object).

    Returns:
        list[str]: Plain lines and ``;``-joined table rows, in input order.
    """
    result = []
    cells = []

    def flush_row():
        # Emit the pending row once and clear it, so it is neither repeated
        # on a later flush nor emitted as an empty string when nothing is
        # pending.
        nonlocal cells
        if cells:
            result.append(smart_strip(cells))
            cells = []

    for line in file.splitlines():
        if not line:
            continue
        if line[0] != '|' and line[-1] != '|':
            # Plain text ends whatever row was being built.
            flush_row()
            result.append(line)
            continue
        row_line = line.split('|')
        starts_new_row = all(not field.isspace() for field in row_line)
        if starts_new_row or not cells:
            # With no pending row there is nothing to continue, so the line
            # opens a row instead of being dropped.
            flush_row()
            cells = row_line
        else:
            # cycle() lets a continuation with fewer fields than the row
            # wrap around instead of running out of values.
            continuation = cycle(row_line)
            cells = [cell + next(continuation) for cell in cells]
    # Flush the final row; otherwise a table that ends the text loses it.
    flush_row()
    return result
def paragraphs_tables(docx):
    """Return the paragraphs and tables of a document in document order.

    The body element's children are walked in order so that text and
    tables keep their relative positions.  Children that are neither a
    paragraph (``w:p``) nor a table (``w:tbl``) are skipped.

    Args:
        docx: The document object whose ``_body`` is traversed.  Note that
            this parameter shadows the ``docx`` module inside the function.

    Returns:
        list: ``Paragraph`` and ``Table`` objects, in body order.
    """
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    paragraph_tag = namespace + 'p'
    table_tag = namespace + 'tbl'

    body = docx._body
    items = []
    # Iterating the element directly replaces the deprecated getchildren().
    for child in body._body:
        if child.tag == paragraph_tag:
            items.append(Paragraph(child, body))
        elif child.tag == table_tag:
            items.append(Table(child, body))
        # Any other child (for example section properties) carries no text
        # and is skipped silently; printing its tag would leak into stdout,
        # where the converted text is written.
    return items
def doc2text(filename):
    """Extract the text of a Word document as a single string.

    ``.docx`` files are read through the ``docx`` package: every paragraph
    becomes one line, and every table row becomes one line made of its
    whitespace-normalised cells (trailing periods removed) joined by
    ``'. '``.  ``.doc`` files are converted by running the external
    ``antiword`` utility; if its output contains ``|`` it is additionally
    passed through ``parse_word_tables``.

    Args:
        filename: Path of the document.  The extension decides the
            conversion path and is matched case-insensitively.

    Returns:
        str: The extracted text, lines separated by newlines.

    Raises:
        Doc2TextError: If the extension is neither ``.doc`` nor ``.docx``,
            if ``antiword`` is not found on PATH, or if ``antiword`` exits
            with a non-zero status.
    """
    lowered = filename.lower()
    if lowered.endswith('.docx'):
        text = []
        document = docx.Document(filename)
        for item in paragraphs_tables(document):
            if isinstance(item, Table):
                for row in item.rows:
                    sections = [
                        ' '.join(cell.text.split()).rstrip('.')
                        for cell in row.cells
                    ]
                    text.append('. '.join(sections))
            else:
                text.append(item.text)
        return '\n'.join(text)

    if lowered.endswith('.doc'):
        # NOTE(review): this changes the locale of the whole process and
        # raises locale.Error when the locale is unavailable - confirm it
        # is still required for the antiword call below.
        locale.setlocale(locale.LC_ALL, ('ru', 'utf8'))
        antiword = shutil.which('antiword')
        if antiword is None:
            raise Doc2TextError('Antiword utility must be installed and added to PATH!')
        # stderr is not captured, so antiword's own diagnostics still reach
        # the caller's stderr.
        with Popen([antiword, filename], stdout=PIPE) as process:
            stdout, _ = process.communicate()
        if process.returncode != 0:
            # A non-zero exit status is taken to mean the conversion failed;
            # report it instead of silently returning empty text.
            raise Doc2TextError(
                'Antiword failed to convert %s (exit status %d)'
                % (filename, process.returncode)
            )
        result = stdout.decode('utf8')
        if consist_table(result):
            # parse_word_tables returns a list of lines; join it so this
            # path returns a str like every other path.
            return '\n'.join(parse_word_tables(result))
        return result

    raise Doc2TextError('Unknown document type')
def main(argv):
    """Command-line entry point: convert one document and print its text.

    Supported options are ``-h`` (print usage and exit with status 0) and
    ``-i <inputfile>`` / ``--ifile <inputfile>``.  When no option is given
    the usage line is printed to stdout and the function returns.

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: With status 2 on an invalid option or when the
            conversion raises ``Doc2TextError`` (the message goes to
            stderr so it cannot mix with converted text on stdout); with
            status 0 for ``-h``.
    """
    usage = 'doc2text.py -i <inputfile>'

    # Only option parsing can raise GetoptError, so keep the try minimal.
    try:
        opts, _ = getopt.getopt(argv, "hi:", ["ifile="])
    except getopt.GetoptError:
        print(usage, file=sys.stderr)
        sys.exit(2)

    if not opts:
        print(usage)
        return

    # None (rather than '') makes the "was an input given" check meaningful.
    inputfile = None
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg

    if inputfile is None:
        return

    try:
        text = doc2text(inputfile)
    except Doc2TextError as e:
        # str(e) is safe even if the exception was raised without arguments.
        print(str(e), file=sys.stderr)
        sys.exit(2)
    print(text)
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    cli_args = sys.argv[1:]  # drop the program name
    main(cli_args)