/
count_words_in_books_class.py
36 lines (28 loc) · 1.12 KB
/
count_words_in_books_class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import pandas as pd
import sys
def count_words_in_book(filename):
"""
"""
with open(filename,'r') as current_file:
text = current_file.read()
text = text.lower()
#get ride of punctuations
punctuation = [',','.','"',"'",";",':']
for punc in punctuation:
text = text.replace(punc,"")
text = text.replace("\n","") #replace newlines
text = text.split() #split long string into words
text_word_count_dict = {}
for word in text:
if word in text_word_count_dict:
text_word_count_dict[word] +=1
else:
text_word_count_dict[word] = 1
text_word_count_df = pd.DataFrame.from_dict(text_word_count_dict,orient = 'index',columns = ['word_count'])
text_word_count_df_sorted = text_word_count_df.sort_values(by = ['word_count'],ascending=False)
return(text_word_count_df_sorted)
#myfile = "../Gutenberg/Books_EngFr/English/shakespeare/Romeo and Juliet.txt"
#path to file
myfile = sys.argv[1]
book_df = count_words_in_book(myfile) #book_df is a dataframe of words sorted by count
print(book_df.iloc[0:10])