enhanced robustness

c70c0960 · EvilCalf · f2b919b5 · c70c0960 · f2b919b5 · c70c0960
Commit c70c0960 authored Dec 11, 2019 by EvilCalf
7 changed files
--- a/Output/1.jpg
+++ b/Output/1.jpg
--- a/Output/154045368169150956.jpg
+++ b/Output/154045368169150956.jpg
--- a/Output/154045651362010599.jpg
+++ b/Output/154045651362010599.jpg
--- a/labimage/1.jpg
+++ b/labimage/1.jpg
--- a/main.py
+++ b/main.py
@@ -10,13 +10,15 @@ tesseract_cmd = r".\tesseract-ocr\tesseract.exe"
 for root, dirs, files in os.walk("labimage/"):
    for file in files:
+        imgaetype = 1
        image = Image.open(root + "/" + file)
-        content = pytesseract.image_to_data(image, lang="chi_sim43", output_type="dict")
+        if image.size[0] == 4032:
+            image = image.rotate(-90)
+            imgaetype = 2
+        content = pytesseract.image_to_data(image, lang="chi_sim12", output_type="dict")
        for i in range(len(content["text"])):
            if 0 < len(content["text"][i]):
-                if content["text"][i] == "姓名" or (
+                if content["text"][i] == "姓名" or content["text"][i] == "姓":
-                    content["text"][i] == "姓" and content["text"][i + 1] == "名"
-                ):
                    (x, y, w, h) = (
                        content["left"][i],
                        content["top"][i],
@@ -24,7 +26,11 @@ for root, dirs, files in os.walk("labimage/"):
                        content["height"][i],
                    )
                    print(x, y, w, h)
-                    img = image.crop((x - 10, y - 10, x + w + 400, y + h + 30))
+                    if imgaetype == 1:
+                        box = (x - 10, y - 10, x + w + 400, y + h + 30)
+                    else:
+                        box = (x - 5, y - 5, x + w + 400, y + h + 10)
+                    img = image.crop(box)
                    content = pytesseract.image_to_string(
                        img, lang="chi_sim43", output_type="dict"
                    )
@@ -35,11 +41,18 @@ for root, dirs, files in os.walk("labimage/"):
                        content = pytesseract.image_to_string(
                            img, lang="chi_sim43", output_type="dict"
                        )
+                    elif content["text"][0] != "姓":
+                        filename = "./Cache/" + file
+                        img.save(filename)
+                        img = cv2.imread(filename)
+                        content = pytesseract.image_to_string(
+                            img, lang="chi_sim12", output_type="dict"
+                        )
                    cnt = content["text"]
                    cnt = cnt.replace(" ", "")
                    cnt = cnt[3:]
                    print(cnt)
-                    image.paste((0, 0, 0), (x - 10, y - 10, x + w + 400, y + h + 30))
+                    image.paste((0, 0, 0), box)
                    image.save("./Output/" + file)
                    data = pd.DataFrame({"name": [cnt], "dir": [root + "/" + file]})
                    data.to_csv("name2file.csv", mode="a", header=False)

--- a/name2file.csv
+++ b/name2file.csv
-,name,dir
+0,禹星航,labimage//1.jpg
+0,黄梅梅,labimage/42/154045368169150956.jpg
+0,何世云,labimage/43/154045384343030182.jpg
+0,刘春佛,labimage/44/154045446763420735.jpg
+0,,labimage/45/154045530108690559.jpg
+0,许赞国,labimage/46/154045562332620522.jpg
+0,黄宇公,labimage/47/154045606255320567.jpg
+0,禹星航,labimage//1.jpg
 0,黄梅梅,labimage/42/154045368169150956.jpg
 0,何世云,labimage/43/154045384343030182.jpg
 0,刘春佛,labimage/44/154045446763420735.jpg

--- a/tesseract-ocr/tessdata/chi_sim12.traineddata
+++ b/tesseract-ocr/tessdata/chi_sim12.traineddata