/
pngToMP3.py
87 lines (71 loc) · 3.33 KB
/
pngToMP3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#! /usr/bin/python3
# import the necessary packages
import argparse
import os
import subprocess

import cv2
import pytesseract
from gtts import gTTS
from PIL import Image
def getText(fileToConvert, thresh=True, blur=False):
    """Extract the text found in an image file using Tesseract OCR.

    The image is loaded with OpenCV, converted to greyscale, optionally
    preprocessed, written to a temporary PNG in the current working directory
    and passed to pytesseract (an interface to the tesseract program, which
    must be installed separately).

    Args:
        fileToConvert: Path of the image file to read.
        thresh: When true (the default), binarise the greyscale image with
            Otsu thresholding before OCR. Use it when the image needs to be
            sharpened.
        blur: When true, apply a median blur (kernel size 3) to reduce noise
            that interferes with character recognition. Only honoured when
            ``thresh`` is false, because the two steps are mutually exclusive
            in this function.

    Returns:
        The recognised text as returned by pytesseract.

    Raises:
        FileNotFoundError: If ``fileToConvert`` is not an existing file.
        ValueError: If OpenCV cannot decode the file as an image.
    """
    if not os.path.isfile(fileToConvert):
        raise FileNotFoundError("Image file not found: {}".format(fileToConvert))

    # cv2.imread signals failure by returning None instead of raising, so
    # check explicitly rather than letting cvtColor fail with an opaque error.
    image = cv2.imread(fileToConvert)
    if image is None:
        raise ValueError("Could not read {} as an image".format(fileToConvert))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # check to see if we should apply thresholding to preprocess the image
    if thresh:
        gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # otherwise check to see if median blurring should be done to remove noise
    elif blur:
        gray = cv2.medianBlur(gray, 3)

    # write the grayscale image to disk as a temporary file so we can apply
    # OCR to it; the process id keeps the name unique per running process
    filename = "{}.png".format(os.getpid())
    cv2.imwrite(filename, gray)
    try:
        # the context manager closes the file handle before the file is removed
        with Image.open(filename) as ocr_input:
            mytext = pytesseract.image_to_string(ocr_input)
    finally:
        # always clean up the temporary file, even when OCR fails
        if os.path.exists(filename):
            os.remove(filename)
    print(mytext)

    # show the output images
    # NOTE(review): no cv2.waitKey call follows, so these windows may never be
    # drawn - confirm whether the preview is still wanted.
    cv2.imshow("Image", image)
    cv2.imshow("Output", gray)
    return mytext
def convertToSpeech(mytext, outputTitle, playAfter):
    """Convert text to an English-language MP3 file and optionally play it.

    Args:
        mytext: The text to be spoken.
        outputTitle: Output file name without the ``.mp3`` extension; the
            audio is saved as ``<outputTitle>.mp3``.
        playAfter: When true, play the saved file with the ``mpg123`` command
            line player. Change the command in the ``subprocess.run`` call
            below to use a different playback program.

    Raises:
        ValueError: If ``mytext`` is empty or contains only whitespace.
    """
    if not mytext or not mytext.strip():
        raise ValueError("No text to convert to speech")

    # Language in which you want to convert
    language = 'en'
    # slow=False asks the engine for normal-speed rather than slow speech
    myobj = gTTS(text=mytext, lang=language, slow=False)
    # Saving the converted audio in a mp3 file named by the caller
    output_path = "{}.mp3".format(outputTitle)
    myobj.save(output_path)

    # Playing the converted file
    if playAfter:
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters from being interpreted as commands.
        try:
            subprocess.run(["mpg123", output_path], check=False)
        except FileNotFoundError:
            # Playback is best effort: the MP3 has already been written.
            print("mpg123 was not found; saved {} without playing it".format(output_path))
def main():
    """Prompt for options, OCR every image under the current directory and
    save the combined text as a single MP3 file.

    The user is asked for the preprocessing mode (threshold, blur or none),
    the output file name, and whether to play the result. Image files are
    visited in sorted order so that the spoken text follows file name order.

    Raises:
        SystemExit: If the preprocessing choice is not one of T, B or N.
    """
    # Only files with these extensions are passed to OCR; anything else
    # (including this script itself) cannot be decoded as an image.
    image_extensions = (".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff")
    preprocess_options = {
        "t": {"thresh": True, "blur": False},
        "b": {"thresh": False, "blur": True},
        "n": {"thresh": False, "blur": False},
    }

    prep = input("Preprocess? (T) Threshold (B) Blur (N) None ").strip().lower()
    if prep not in preprocess_options:
        # An unknown choice used to skip every file silently.
        raise SystemExit("Unknown preprocess option {!r}; expected T, B or N".format(prep))
    outputTitle = input("Output filename? ")
    play = input("Play file after compiling? Y/N ")
    playAfter = play.strip().lower() == "y"

    # The extension is added again when the file is saved.
    if outputTitle.endswith(".mp3"):
        outputTitle = outputTitle[:-4]

    options = preprocess_options[prep]
    extracted = []
    for root, dirs, files in os.walk("."):
        # Sorting in place makes os.walk descend into directories in order.
        dirs.sort()
        for filename in sorted(files):
            if not filename.lower().endswith(image_extensions):
                continue
            # File names from os.walk are relative to root, so join them to
            # reach files that live in subdirectories.
            extracted.append(getText(os.path.join(root, filename), **options))

    compiledText = " ".join(extracted)
    if not compiledText.strip():
        print("No text was recognised in any image; nothing to convert.")
        return
    convertToSpeech(compiledText, outputTitle, playAfter)
# Run the interactive conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()