雖然前面第二次升級時就已經通過json配置文件支持將ocr識別結果txt保存到指定的文件夾裡了,但指定的待識別文件夾裡可能包含多個子文件夾,不同的子文件夾裡又可能包含同名的圖像文件;而原來的方式是直接把所有的txt全部放在json文件指定的同一個文件夾中,當不同子文件夾中存在同名的圖像文件時,就會出現txt文件互相覆蓋的情況。雖然幾率很小,但是開發小伙伴們反映確實出現了這種情況,所以需要改進。改進後的方式是:在json指定的txt保存路徑下,按照待識別圖像文件夾的目錄結構重新新建相同的文件夾結構並存放txt文件,這樣就可以完全避免由於大文件夾中的文件重名而導致識別結果txt文件被覆蓋、從而丟失的情況發生了。升級後的代碼如下所示:
#!/home/super/miniconda3/bin/python
#encoding=utf-8
#author: superchao1982, [email protected]
# Help text printed for -h/--help.
# A raw string is used so that the backslashes of the Windows path are kept
# literally instead of being parsed as (invalid) escape sequences.
# The sample config.json below is valid JSON and mirrors the built-in defaults.
strhelp=r'''
img2txt is one program to get ocr texts from image or pdf files!
default threshold is 0.1;
default langpath is '/home/langdata' for linux and 'C:\ocr\langdata' for win;
default remove char is '| _^~`&';
default path storing the ocr texts are the same directory with images;
default settings above can be changed in the file 'config.json' which stored in langpath;
contents in config.json like:
{
"threshold": 0.1,
"batchsize": 2,
"workernum": 4,
"maximgsize": 1000,
"allowlist": "",
"langpath": "/home/langdata",
"removechar": "| _^~`&",
"txtpath": ""
}
------------------------------------
e.g.
./img2txt.py img1.jpg img2.jpg #follow by one or more image files
./img2txt.py ./img1 ./img home/usr/Document/img #follow by one or more directory contain image files
./img2txt.py --help #output the help info
./img2txt.py --config #generate the default config.json file in the langpath
------------------------------------
'''
import sys
import json
import os
import pdf2image
import numpy as np
#------------------Default settings----------------------
threshold = 0.1   # (default = 0.1) confidence threshold; fragments at or below it are dropped
batchsize = 2     # (default = 1) batch_size>1 will make EasyOCR faster but use more memory
workernum = 4     # (default = 0) number of threads used by the dataloader
maximgsize = 1000 # (default = 1000) max image width & height when using pdf
allowlist = ''    # (string) force EasyOCR to recognize only a subset of characters
removechar = '| _^~`&'  # invalid characters removed from the recognised text
# Where the txt result is stored: empty means the same directory as the image,
# '.' means a relative directory, anything else an absolute directory.
txtpath = ''

# Pick the default language-pack (model) directory for the current system.
_platform = sys.platform.lower()
if _platform.startswith('linux'):
    langpath = '/home/langdata'
elif _platform.startswith('win'):
    # Raw string: '\o' and '\l' are not valid escape sequences.
    langpath = r'C:\ocr\langdata'
else:
    print('Error: Unknow System!')
    sys.exit(1)  # non-zero status so callers can detect the failure

# Dictionary of the settings above; this is what --config writes to config.json.
config = {
    "threshold": threshold,
    "batchsize": batchsize,
    "workernum": workernum,
    "maximgsize": maximgsize,
    "allowlist": allowlist,
    "langpath": langpath,
    "removechar": removechar,
    "txtpath": txtpath,
}
#------------------Command-line argument handling----------------------
# Every argument is checked before the OCR package is loaded, so that a wrong
# argument fails immediately instead of after the time-consuming model load.
for arg in sys.argv[1:]:  # sys.argv[0] is the script itself
    if arg in ('-h', '--help'):
        print(strhelp)
        sys.exit()
    if arg in ('-c', '--config'):
        # Write the built-in defaults to config.json inside langpath, then stop.
        configpath = os.path.join(langpath, 'config.json')
        try:
            with open(configpath, 'w') as jsonfile:
                json.dump(config, jsonfile, ensure_ascii=False, indent=4)
        except (OSError, TypeError, ValueError) as e:
            print('\tSaving config file config.json Error: ', e)
            sys.exit(1)  # report the failure through the exit status
        print('Genrerating config.json success! ---> ', configpath)
        sys.exit()
    # Anything else has to be an existing image file or directory.
    if not os.path.exists(arg):
        print(arg, ' is invalid, please input the correct file or directory path!')
        sys.exit(1)
# Check that the language-pack directory is valid.
if not os.path.exists(langpath):
    print('Error: Invalid langpath! Checking the path again!')
    sys.exit(1)

# If config.json exists inside langpath, its values replace the defaults.
# Every key is required; a missing key or malformed file aborts the run.
configfile = os.path.join(langpath, 'config.json')
if os.path.exists(configfile):
    try:
        with open(configfile, 'r') as jsonfile:
            configdict = json.load(jsonfile)
        threshold = configdict['threshold']
        batchsize = configdict['batchsize']
        workernum = configdict['workernum']
        maximgsize = configdict['maximgsize']
        langpath = configdict['langpath']
        allowlist = configdict['allowlist']
        removechar = configdict['removechar']
        txtpath = configdict['txtpath']
        print('using the config in ', configfile)
    except (OSError, ValueError, KeyError, TypeError) as e:
        print('\tReading config file ', configfile, ' Error: ', e)
        print('\tCheck the json file, or remove the config.json file to use defaulting configs!')
        sys.exit(1)
else:
    print('\tusing the default config in ', langpath)
    # No file was loaded: show the built-in defaults instead of referencing
    # an unbound name (the original raised NameError on the print below).
    configdict = config
print(configdict)
# If the txt output directory given in config.json does not exist, create it.
# os.makedirs is used instead of shelling out to `mkdir`: it creates missing
# parent directories, copes with spaces in the path, and raises OSError on
# failure so the error branch below is actually reachable.
if txtpath and not os.path.exists(txtpath):
    print('txtpath in config.json is not exists, generating ', txtpath, '!\n')
    try:
        os.makedirs(txtpath, exist_ok=True)
    except OSError as e:
        print('\tMaking txt directory Error: ', e)
        print('\tPlease input a legal txtpath in the config.json file and try again!\n')
        sys.exit(1)
#------------------Start OCR recognition----------------------
# easyocr is imported only here, after the arguments and settings have been
# checked, so that invalid input never pays for loading the package.
import easyocr

# Languages to recognise; the models are looked up in langpath
# (Linux: /home/langdata, Windows: C:\ocr\langdata).
ocr_languages = ['ch_sim', 'en']
ocrreader = easyocr.Reader(
    ocr_languages,
    model_storage_directory=langpath,
)
# File extensions (compared upper-cased) accepted by the OCR loop.
SUPPORTED_EXTS = ('.JPG', '.JPEG', '.PNG', '.BMP', '.PDF')


def is_supported(path):
    """Return True if *path* ends with one of the accepted image/PDF extensions."""
    return os.path.splitext(path)[-1].upper() in SUPPORTED_EXTS


def ocr_text(imgpath):
    """Recognise one image or PDF file and return its cleaned text.

    PDF files are converted to one image per page; pages whose longer side
    exceeds ``maximgsize`` are scaled down first to limit memory use.
    Only fragments whose confidence is above ``threshold`` are kept.
    Characters listed in ``removechar`` and all CR/LF are removed.
    """
    pieces = []
    if os.path.splitext(imgpath)[-1].upper() == '.PDF':
        for page in pdf2image.convert_from_path(imgpath):
            ratio = max(page.width, page.height) / maximgsize
            if ratio > 1:
                page = page.resize((round(page.width / ratio), round(page.height / ratio)))
            result = ocrreader.readtext(np.asarray(page), batch_size=batchsize, workers=workernum)
            # Each result item is indexed as [1] = text, [2] = confidence.
            pieces.extend(w[1] for w in result if w[2] > threshold)
    else:
        result = ocrreader.readtext(imgpath, batch_size=batchsize, workers=workernum)
        pieces.extend(w[1] for w in result if w[2] > threshold)
    paper = ''.join(pieces)
    for item in removechar:
        paper = paper.replace(item, '')
    return paper.replace('\r', '').replace('\n', '')


def mirrored_txt_path(imgfilepath, txtroot, argpath=None):
    """Return the txt file path that belongs to *imgfilepath*.

    * Empty *txtroot*: the txt sits next to the image.
    * *argpath* is None (image given directly on the command line): the txt
      goes straight into *txtroot*.
    * Otherwise *argpath* is the directory given on the command line and the
      image's path relative to it is re-created below *txtroot*, so images
      with the same name in different sub-directories do not overwrite each
      other's result.

    The relative part is computed with os.path.relpath. str.lstrip must not
    be used for this: it strips a *set of characters*, not a prefix, which
    mangles names such as ``img/img1.jpg`` and can leave a leading separator
    that makes os.path.join discard *txtroot* altogether.
    """
    if not txtroot:
        return os.path.splitext(imgfilepath)[0] + '.txt'
    if argpath is None:
        relative = os.path.basename(imgfilepath)
    else:
        relative = os.path.relpath(imgfilepath, argpath)
    return os.path.join(txtroot, os.path.splitext(relative)[0] + '.txt')


def save_text(imgfilepath, paper, argpath=None):
    """Write *paper* to the txt file of *imgfilepath*; errors are printed, not raised."""
    txtfilename = mirrored_txt_path(imgfilepath, txtpath, argpath)
    txtdir = os.path.dirname(txtfilename)
    if txtdir and not os.path.isdir(txtdir):
        try:
            os.makedirs(txtdir, exist_ok=True)
        except OSError as e:
            print('\tMaking txt directory Error: ', e)
            print('\tTxt file will be storded in the image file directory!')
            # Fall back to storing the txt next to the image.
            txtfilename = os.path.splitext(imgfilepath)[0] + '.txt'
    print('saving file ---> ', txtfilename)
    # NOTE(review): the file is written with the platform default encoding,
    # as before; consider passing encoding='utf-8' if consumers allow it.
    try:
        with open(txtfilename, 'w') as txtfile:
            txtfile.write(paper)
    except (OSError, UnicodeError) as e:
        print('\t', txtfilename, ' Saving txt File Error: ', e)


for argpath in sys.argv[1:]:  # sys.argv[0] is the script itself
    if os.path.isfile(argpath):
        # A single image/PDF file.
        if not is_supported(argpath):
            print('\t', argpath, ' 不是有效圖片格式(jpg/jpeg/png/bmp/pdf)!')
            continue
        save_text(argpath, ocr_text(argpath))
    elif os.path.isdir(argpath):
        # A directory: process every supported file below it, recursively.
        for root, _, filenames in os.walk(argpath):
            for imgfile in filenames:
                if not is_supported(imgfile):
                    print('\t', imgfile, '的後綴名不是有效的圖像格式,跳過該文件!')
                    continue
                imgfilepath = os.path.join(root, imgfile)
                save_text(imgfilepath, ocr_text(imgfilepath), argpath)