pytesseract测试

发布于 2020-07-02  870 次阅读


from PIL import Image
#from itertools import cycle
import os, random
import pytesseract
# Tesseract command-line options: page segmentation mode 8 (treat the image
# as a single word), OCR engine mode 0 (legacy engine), and a character
# whitelist restricted to lowercase ASCII letters.
config = (
    "--psm 8 --oem 0 "
    "-c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz"
)


def tesOCR(img):
    """Run Tesseract on *img* and return the recognised text.

    *img* is handed to ``pytesseract.image_to_string`` unchanged, with the
    English language data and the module-level ``config`` options.
    """
    text = pytesseract.image_to_string(img, lang='eng', config=config)
    return text


class Fileset(list):
    """List of the file names in one directory that hands out full paths.

    The list itself stores bare file names; indexing and iteration join each
    name with ``self.root``.  Calling the instance picks a random file and,
    when a reader callable was supplied, returns the reader's result for it.
    """

    def __init__(self, name, ext='', _read=None, root=None):
        """Collect the files of directory *name* whose names end with *ext*.

        name  -- directory name, joined onto *root* (or the current working
                 directory when *root* is falsy).  A non-string value (e.g.
                 ``None``) creates an empty set without touching the disk.
        ext   -- suffix a file name must end with to be included.
        _read -- optional callable applied to the chosen path by ``__call__``.
        root  -- optional base directory.
        """
        super().__init__()
        # Always define both attributes so that even an empty Fileset(None)
        # can be inspected, iterated and called without AttributeError.
        self.root = root
        self._read = _read
        if isinstance(name, str):
            self.root = os.path.join(root or os.getcwd(), name)
            self.extend(f for f in os.listdir(self.root) if f.endswith(ext))

    def __getitem__(self, index):
        """Return the full path at *index*, or a sub-Fileset for a slice."""
        if isinstance(index, slice):
            # A slice yields a new Fileset sharing the same root and reader.
            subset = Fileset(None, _read=self._read, root=self.root)
            subset.extend(super().__getitem__(index))
            return subset
        # Anything else is an integer-like position (int, or any object
        # implementing __index__); list.__getitem__ rejects other types.
        return os.path.join(self.root, super().__getitem__(index))

    def getFileName(self, index):
        """Return the file name at *index* without directory or extension."""
        fname, _ext = os.path.splitext(super().__getitem__(index))
        return fname

    def __iter__(self):
        """Iterate over the full paths of all files in the set."""
        names = super().__iter__()
        return (os.path.join(self.root, f) for f in names)

    def __call__(self):
        """Pick a random file; return ``_read(path)`` if a reader is set, else the path."""
        retn = random.choice(self)
        if self._read:
            return self._read(retn)
        return retn

sample = Fileset('Captcha', '.jpg', Image.open)

# Benchmark: run OCR on 100 randomly chosen captcha images, each converted to
# "L" mode (8-bit grayscale) first, and print the elapsed seconds.
# (An earlier, now removed, variant also ran OCR on the unconverted image and
# displayed the cases where the two results differed.)
import time
t = time.perf_counter()  # monotonic clock intended for measuring intervals
for _ in range(100):
    a = sample()
    y = tesOCR(a.convert("L"))
print(time.perf_counter() - t)
# The author's run took about 23 s.
测试用验证码文件
与百度OCR的对比(每组输出中,第一行是 tesseract OCR 的识别结果,第二行是百度OCR的识别结果):
#tesOCR.py
from PIL import Image
import pytesseract
from io import BytesIO

# Tesseract command-line options: page segmentation mode 8 (treat the image
# as a single word), OCR engine mode 0 (legacy engine), and a character
# whitelist restricted to lowercase ASCII letters.
config = (
    "--psm 8 --oem 0 "
    "-c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz"
)


def tesOCR(imgdata):
    """Recognise the text in an encoded image given as raw bytes.

    *imgdata* is wrapped in an in-memory buffer, opened with PIL, converted
    to "L" mode (8-bit grayscale) and passed to Tesseract with the English
    language data and the module-level ``config`` options.
    """
    buffer = BytesIO(imgdata)
    grayscale = Image.open(buffer).convert("L")
    return pytesseract.image_to_string(grayscale, lang='eng', config=config)
#baiduOCR.py
from aip import AipOcr
""" 你的 APPID AK SK """
# Baidu AIP credentials (APP ID, API key "AK", secret key "SK").
# They are left blank here; fill in your own values before use.
APP_ID = ''
API_KEY = ''
SECRET_KEY = ''

# Module-level OCR client built once from the credentials above.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
# Default request options for the OCR calls; note that the flag values are
# the strings "true", not Python booleans.
options = {
    "language_type": "ENG",
    "detect_direction": "true",
    "detect_language": "true",
}

def BaiduOCR(image, options=options):
    """Recognise text with the Baidu OCR client and return it as one string.

    image   -- a ``str`` is sent through ``client.basicGeneralUrl``; any
               other value is sent through ``client.basicGeneral``.
    options -- request options forwarded to the client call.

    Returns the ``'words'`` values of all result lines concatenated without
    a separator.  If the response contains ``'error_code'``, the error is
    printed and an empty string is returned.
    """
    # Choose the endpoint by argument type, then issue a single request.
    recognise = client.basicGeneralUrl if isinstance(image, str) else client.basicGeneral
    results = recognise(image, options)

    if 'error_code' in results:
        print (f'BaiduOCR error {results["error_code"]} {results["error_msg"]}')
        return ''

    # Result lines without a 'words' key are skipped.
    words = [entry['words'] for entry in results['words_result'] if 'words' in entry]
    return ''.join(words)
#test.py
import os, random
class Fileset(list):
    """List of the file names in one directory that hands out full paths.

    The list itself stores bare file names; indexing and iteration join each
    name with ``self.root``.  Calling the instance picks a random file and,
    when a reader callable was supplied, returns the reader's result for it.
    """

    def __init__(self, name, ext='', _read=None, root=None):
        """Collect the files of directory *name* whose names end with *ext*.

        name  -- directory name, joined onto *root* (or the current working
                 directory when *root* is falsy).  A non-string value (e.g.
                 ``None``) creates an empty set without touching the disk.
        ext   -- suffix a file name must end with to be included.
        _read -- optional callable applied to the chosen path by ``__call__``.
        root  -- optional base directory.
        """
        super().__init__()
        # Always define both attributes so that even an empty Fileset(None)
        # can be inspected, iterated and called without AttributeError.
        self.root = root
        self._read = _read
        if isinstance(name, str):
            self.root = os.path.join(root or os.getcwd(), name)
            self.extend(f for f in os.listdir(self.root) if f.endswith(ext))

    def __getitem__(self, index):
        """Return the full path at *index*, or a sub-Fileset for a slice."""
        if isinstance(index, slice):
            # A slice yields a new Fileset sharing the same root and reader.
            subset = Fileset(None, _read=self._read, root=self.root)
            subset.extend(super().__getitem__(index))
            return subset
        # Anything else is an integer-like position (int, or any object
        # implementing __index__); list.__getitem__ rejects other types.
        return os.path.join(self.root, super().__getitem__(index))

    def getFileName(self, index):
        """Return the file name at *index* without directory or extension."""
        fname, _ext = os.path.splitext(super().__getitem__(index))
        return fname

    def __iter__(self):
        """Iterate over the full paths of all files in the set."""
        names = super().__iter__()
        return (os.path.join(self.root, f) for f in names)

    def __call__(self):
        """Pick a random file; return ``_read(path)`` if a reader is set, else the path."""
        retn = random.choice(self)
        if self._read:
            return self._read(retn)
        return retn
def fopen(path):
    """Return the complete contents of the file at *path* as bytes."""
    with open(path, mode='rb') as stream:
        data = stream.read()
    return data
sample = Fileset('Captcha', '.jpg', fopen)

# Let the user pick the recogniser: "1" selects Baidu OCR, and anything else
# (including an empty answer) falls back to tesseract, as the prompt promises.
# Previously an unrecognised answer left OCR bound to the typed string, which
# then failed with "'str' object is not callable" on first use.
choice = input('请选择验证码识别方式(默认为tesseract, 1为百度OCR):').strip()
if choice == "1":
    from baiduOCR import BaiduOCR as OCR
else:
    from tesOCR import tesOCR as OCR
# Baidu OCR is always imported too, so each sample can be compared against it.
from baiduOCR import BaiduOCR

# For 10 random captchas print the selected recogniser's result, then Baidu's.
for _ in range(10):
    a = sample()
    print(OCR(a))
    print(BaiduOCR(a))

医学生