"""Black out the name field on lab images and record the OCR'd names.

Walks ``labimage/``, runs Tesseract on every image to locate the name label
("姓名" or "姓"), OCRs the region next to it, paints that region black,
saves the redacted image under ``./Output/`` and appends a
``name, source path`` row to ``name2file.csv``.
"""
import os

import cv2
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image, ImageDraw, ImageFont, ImageGrab

tesseract_cmd = r".\tesseract-ocr\tesseract.exe"
# pytesseract only reads the executable path from its own module attribute;
# a plain variable in this script is never consulted.  Apply it only when the
# bundled binary is actually present so a PATH-installed tesseract still works.
if os.path.isfile(tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

_CACHE_DIR = "./Cache/"
_OUTPUT_DIR = "./Output/"
_NAME_LABELS = ("姓名", "姓")
# Images this wide are rotated before OCR.
_ROTATE_WIDTH = 4032


def _find_name_box(ocr_data, rotated):
    """Return the crop box around the first name label, or None if absent.

    ``ocr_data`` is the dict produced by ``pytesseract.image_to_data``;
    ``rotated`` selects the tighter margins used for rotated images.
    """
    for i, word in enumerate(ocr_data["text"]):
        if word not in _NAME_LABELS:
            continue
        x = ocr_data["left"][i]
        y = ocr_data["top"][i]
        w = ocr_data["width"][i]
        h = ocr_data["height"][i]
        print(x, y, w, h)
        # The box extends 400 px to the right of the label so that it also
        # covers the text following the label.
        if rotated:
            return (x - 5, y - 5, x + w + 400, y + h + 10)
        return (x - 10, y - 10, x + w + 400, y + h + 30)
    return None


def _read_name(crop, cache_path):
    """OCR the cropped name field and return the text after the label.

    When the first pass yields nothing, or text that does not start with
    "姓", the crop is written to ``cache_path``, re-read through OpenCV and
    OCR'd a second time.
    """
    text = pytesseract.image_to_string(
        crop, lang="chi_sim43", output_type="dict"
    )["text"]

    if text == "":
        retry_lang = "chi_sim43"
    elif text[0] != "姓":
        retry_lang = "chi_sim12"
    else:
        retry_lang = None

    if retry_lang is not None:
        crop.save(cache_path)
        reloaded = cv2.imread(cache_path)
        # cv2.imread signals failure by returning None instead of raising;
        # keep the first-pass text in that case rather than crashing.
        if reloaded is not None:
            text = pytesseract.image_to_string(
                reloaded, lang=retry_lang, output_type="dict"
            )["text"]

    # Drops spaces, then the first three characters -- presumably the label
    # plus its separator (e.g. "姓名:"); TODO confirm against real scans.
    return text.replace(" ", "")[3:]


def _process_image(root, file):
    """Redact the name field of one image and log the recognised name.

    Files that cannot be opened as images are skipped with a message.
    Images without a detectable name label are left untouched (nothing is
    written for them).
    """
    path = root + "/" + file
    try:
        opened = Image.open(path)
    except OSError as err:
        # Covers unreadable files and non-image files found in the tree.
        print("skipping", path, err)
        return

    with opened:
        rotated = opened.size[0] == _ROTATE_WIDTH
        # NOTE(review): rotate() without expand=True keeps the original
        # canvas size, so a non-square image is cropped by the rotation.
        # Left unchanged because the crop margins may be tuned to it.
        image = opened.rotate(-90) if rotated else opened

        page = pytesseract.image_to_data(
            image, lang="chi_sim12", output_type="dict"
        )
        box = _find_name_box(page, rotated)
        if box is None:
            return

        cnt = _read_name(image.crop(box), _CACHE_DIR + file)
        print(cnt)

        image.paste((0, 0, 0), box)
        image.save(_OUTPUT_DIR + file)

    data = pd.DataFrame({"name": [cnt], "dir": [path]})
    data.to_csv("name2file.csv", mode="a", header=False)


# Both directories are written to below; create them so a fresh checkout
# does not fail on the first save.
os.makedirs(_CACHE_DIR, exist_ok=True)
os.makedirs(_OUTPUT_DIR, exist_ok=True)

for root, dirs, files in os.walk("labimage/"):
    for file in files:
        _process_image(root, file)