# main.py 1.92 KB
import os

import cv2
import numpy as np
import pandas as pd
import pytesseract
from PIL import Image, ImageDraw, ImageFont, ImageGrab

# Location of the Tesseract executable, relative to the current working directory.
tesseract_cmd = r".\tesseract-ocr\tesseract.exe"

# pytesseract looks up the executable through `pytesseract.pytesseract.tesseract_cmd`;
# a plain module-level variable is never read by the library. Only override the
# default when the file is actually present, so an installation that relies on
# Tesseract being on PATH keeps working.
if os.path.isfile(tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

def find_name_label(words):
    """Return the index of the first name label in *words*, or ``None``.

    A label is either the single token "姓名" or the token "姓" immediately
    followed by the token "名" (the label split across two adjacent tokens).
    In the split case the index of "姓" is returned.

    Args:
        words: Sequence of recognized text tokens.

    Returns:
        The index of the label token, or ``None`` when no label is present.
    """
    for index, word in enumerate(words):
        if word == "姓名":
            return index
        # Bounds check: "姓" may be the very last token, with nothing after it.
        if word == "姓" and index + 1 < len(words) and words[index + 1] == "名":
            return index
    return None


def extract_name(raw_text: str) -> str:
    """Return the name portion of OCR text recognized from the label region.

    Spaces and surrounding whitespace (including trailing newlines and form
    feeds) are removed, then the first three characters are dropped.

    Args:
        raw_text: Text recognized from the cropped label region.

    Returns:
        The remaining text after the three-character prefix; empty when the
        input is shorter than that.
    """
    compact = raw_text.replace(" ", "").strip()
    # Presumably the prefix is the two-character label plus a separator
    # (e.g. "姓名:") -- verify against real scans.
    return compact[3:]


# Walk every file under labimage/, locate the name label via OCR, record the
# text next to it in name2file.csv and save a copy with that region blacked out.
for root, _dirs, files in os.walk("labimage/"):
    for file in files:
        source_path = root + "/" + file
        # Context manager releases the underlying file handle for each image.
        with Image.open(source_path) as image:
            layout = pytesseract.image_to_data(
                image, lang="chi_sim43", output_type="dict"
            )
            label_index = find_name_label(layout["text"])
            if label_index is None:
                # No name label recognized in this image: nothing to redact.
                continue

            x = layout["left"][label_index]
            y = layout["top"][label_index]
            w = layout["width"][label_index]
            h = layout["height"][label_index]
            print(x, y, w, h)

            # Region covering the label plus a margin to its right and below,
            # which is where the name text is read from.
            region = (x - 10, y - 10, x + w + 400, y + h + 30)
            img = image.crop(region)
            text = pytesseract.image_to_string(
                img, lang="chi_sim43", output_type="dict"
            )["text"]

            if not text.strip():
                # Nothing recognized from the in-memory crop: round-trip it
                # through a file and OpenCV, then run OCR once more.
                os.makedirs("./Cache/", exist_ok=True)
                filename = "./Cache/" + file
                img.save(filename)
                img = cv2.imread(filename)
                text = pytesseract.image_to_string(
                    img, lang="chi_sim43", output_type="dict"
                )["text"]

            cnt = extract_name(text)
            print(cnt)

            # Black out the label/name region and store the redacted copy.
            image.paste((0, 0, 0), region)
            os.makedirs("./Output/", exist_ok=True)
            image.save("./Output/" + file)

            # Append one (name, source path) row; no header is written.
            data = pd.DataFrame({"name": [cnt], "dir": [source_path]})
            data.to_csv("name2file.csv", mode="a", header=False)