pytesseract
pip install pytesseract
pytesseract
pip install pytesseract
text recognition python library
import cv2
import pytesseract
img = cv2.imread('image.jpg')
# Adding custom options
custom_config = r'--oem 3 --psm 6'
pytesseract.image_to_string(img, config=custom_config)
tesseract python ocr
try:
from PIL import Image
except ImportError:
import Image
import pytesseract
# If you don't have tesseract executable in your PATH, include the following:
pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
# Example tesseract_cmd = r'C:Program Files (x86)Tesseract-OCRtesseract'
# Simple image to string
print(pytesseract.image_to_string(Image.open('test.png')))
# List of available languages
print(pytesseract.get_languages(config=''))
# French text image to string
print(pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra'))
# In order to bypass the image conversions of pytesseract, just use relative or absolute image path
# NOTE: In this case you should provide tesseract supported images or tesseract will return error
print(pytesseract.image_to_string('test.png'))
# Batch processing with a single file containing the list of multiple image file paths
print(pytesseract.image_to_string('images.txt'))
# Timeout/terminate the tesseract job after a period of time
try:
print(pytesseract.image_to_string('test.jpg', timeout=2)) # Timeout after 2 seconds
print(pytesseract.image_to_string('test.jpg', timeout=0.5)) # Timeout after half a second
except RuntimeError as timeout_error:
# Tesseract processing is terminated
pass
# Get bounding box estimates
print(pytesseract.image_to_boxes(Image.open('test.png')))
# Get verbose data including boxes, confidences, line and page numbers
print(pytesseract.image_to_data(Image.open('test.png')))
# Get information about orientation and script detection
print(pytesseract.image_to_osd(Image.open('test.png')))
# Get a searchable PDF
pdf = pytesseract.image_to_pdf_or_hocr('test.png', extension='pdf')
with open('test.pdf', 'w+b') as f:
f.write(pdf) # pdf type is bytes by default
# Get HOCR output
hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')
# Get ALTO XML output
xml = pytesseract.image_to_alto_xml('test.png')
ocr python
# https://github.com/mindee/doctr
from doctr.models import ocr_predictor
model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
Copyright © 2021 Codeinu
Forgot your account's password or having trouble logging into your Account? Don't worry, we'll help you to get back your account. Enter your email address and we'll send you a recovery link to reset your password. If you are experiencing problems resetting your password contact us