Commit c70c0960 authored by EvilCalf

enhanced robustness

parent f2b919b5
This diff is collapsed.
This diff is collapsed.
...@@ -10,13 +10,15 @@ tesseract_cmd = r".\tesseract-ocr\tesseract.exe" ...@@ -10,13 +10,15 @@ tesseract_cmd = r".\tesseract-ocr\tesseract.exe"
for root, dirs, files in os.walk("labimage/"): for root, dirs, files in os.walk("labimage/"):
for file in files: for file in files:
imgaetype = 1
image = Image.open(root + "/" + file) image = Image.open(root + "/" + file)
content = pytesseract.image_to_data(image, lang="chi_sim43", output_type="dict") if image.size[0] == 4032:
image = image.rotate(-90)
imgaetype = 2
content = pytesseract.image_to_data(image, lang="chi_sim12", output_type="dict")
for i in range(len(content["text"])): for i in range(len(content["text"])):
if 0 < len(content["text"][i]): if 0 < len(content["text"][i]):
if content["text"][i] == "姓名" or ( if content["text"][i] == "姓名" or content["text"][i] == "姓":
content["text"][i] == "姓" and content["text"][i + 1] == "名"
):
(x, y, w, h) = ( (x, y, w, h) = (
content["left"][i], content["left"][i],
content["top"][i], content["top"][i],
...@@ -24,7 +26,11 @@ for root, dirs, files in os.walk("labimage/"): ...@@ -24,7 +26,11 @@ for root, dirs, files in os.walk("labimage/"):
content["height"][i], content["height"][i],
) )
print(x, y, w, h) print(x, y, w, h)
img = image.crop((x - 10, y - 10, x + w + 400, y + h + 30)) if imgaetype == 1:
box = (x - 10, y - 10, x + w + 400, y + h + 30)
else:
box = (x - 5, y - 5, x + w + 400, y + h + 10)
img = image.crop(box)
content = pytesseract.image_to_string( content = pytesseract.image_to_string(
img, lang="chi_sim43", output_type="dict" img, lang="chi_sim43", output_type="dict"
) )
...@@ -35,11 +41,18 @@ for root, dirs, files in os.walk("labimage/"): ...@@ -35,11 +41,18 @@ for root, dirs, files in os.walk("labimage/"):
content = pytesseract.image_to_string( content = pytesseract.image_to_string(
img, lang="chi_sim43", output_type="dict" img, lang="chi_sim43", output_type="dict"
) )
elif content["text"][0] != "姓":
filename = "./Cache/" + file
img.save(filename)
img = cv2.imread(filename)
content = pytesseract.image_to_string(
img, lang="chi_sim12", output_type="dict"
)
cnt = content["text"] cnt = content["text"]
cnt = cnt.replace(" ", "") cnt = cnt.replace(" ", "")
cnt = cnt[3:] cnt = cnt[3:]
print(cnt) print(cnt)
image.paste((0, 0, 0), (x - 10, y - 10, x + w + 400, y + h + 30)) image.paste((0, 0, 0), box)
image.save("./Output/" + file) image.save("./Output/" + file)
data = pd.DataFrame({"name": [cnt], "dir": [root + "/" + file]}) data = pd.DataFrame({"name": [cnt], "dir": [root + "/" + file]})
data.to_csv("name2file.csv", mode="a", header=False) data.to_csv("name2file.csv", mode="a", header=False)
......
,name,dir 0,禹星航,labimage//1.jpg
0,黄梅梅,labimage/42/154045368169150956.jpg
0,何世云,labimage/43/154045384343030182.jpg
0,刘春佛,labimage/44/154045446763420735.jpg
0,,labimage/45/154045530108690559.jpg
0,许赞国,labimage/46/154045562332620522.jpg
0,黄宇公,labimage/47/154045606255320567.jpg
0,禹星航,labimage//1.jpg
0,黄梅梅,labimage/42/154045368169150956.jpg 0,黄梅梅,labimage/42/154045368169150956.jpg
0,何世云,labimage/43/154045384343030182.jpg 0,何世云,labimage/43/154045384343030182.jpg
0,刘春佛,labimage/44/154045446763420735.jpg 0,刘春佛,labimage/44/154045446763420735.jpg
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment