diff --git a/Output/1.jpg b/Output/1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..813320f75e6f16900810943cc3d4b2d782370521 Binary files /dev/null and b/Output/1.jpg differ diff --git a/Output/154045368169150956.jpg b/Output/154045368169150956.jpg index 06756d3890ff994bd90efa2bf00df483062969f7..467d3279d33dc993172e57312c8f822c1abe3c8e 100644 Binary files a/Output/154045368169150956.jpg and b/Output/154045368169150956.jpg differ diff --git a/Output/154045651362010599.jpg b/Output/154045651362010599.jpg index 3a916ab0d105e22f1448c3b0e4e6179e258def98..09a7654c19d68fbfa6476f8078e77a3544d8605b 100644 Binary files a/Output/154045651362010599.jpg and b/Output/154045651362010599.jpg differ diff --git a/labimage/1.jpg b/labimage/1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..31b30788de711663a6b94630dd075d673a99e178 Binary files /dev/null and b/labimage/1.jpg differ diff --git a/main.py b/main.py index 4c0a278aabc32da1e8319e171786e36a489e5acf..77d31aa35d2ddded2c0f4332540d50e6c69ab081 100644 --- a/main.py +++ b/main.py @@ -10,13 +10,15 @@ tesseract_cmd = r".\tesseract-ocr\tesseract.exe" for root, dirs, files in os.walk("labimage/"): for file in files: + imgaetype = 1 image = Image.open(root + "/" + file) - content = pytesseract.image_to_data(image, lang="chi_sim43", output_type="dict") + if image.size[0] == 4032: + image = image.rotate(-90) + imgaetype = 2 + content = pytesseract.image_to_data(image, lang="chi_sim12", output_type="dict") for i in range(len(content["text"])): if 0 < len(content["text"][i]): - if content["text"][i] == "姓名" or ( - content["text"][i] == "姓" and content["text"][i + 1] == "名" - ): + if content["text"][i] == "姓名" or content["text"][i] == "姓": (x, y, w, h) = ( content["left"][i], content["top"][i], @@ -24,7 +26,11 @@ for root, dirs, files in os.walk("labimage/"): content["height"][i], ) print(x, y, w, h) - img = image.crop((x - 10, y - 10, x + w + 400, y + h + 30)) + if imgaetype == 1: + box = (x - 10, y - 10, x + w + 400, y + h + 30) + else: + box = (x - 5, y - 5, x + w + 400, y + h + 10) + img = image.crop(box) content = pytesseract.image_to_string( img, lang="chi_sim43", output_type="dict" ) @@ -35,11 +41,18 @@ for root, dirs, files in os.walk("labimage/"): content = pytesseract.image_to_string( img, lang="chi_sim43", output_type="dict" ) + elif content["text"][0] != "姓": + filename = "./Cache/" + file + img.save(filename) + img = cv2.imread(filename) + content = pytesseract.image_to_string( + img, lang="chi_sim12", output_type="dict" + ) cnt = content["text"] cnt = cnt.replace(" ", "") cnt = cnt[3:] print(cnt) - image.paste((0, 0, 0), (x - 10, y - 10, x + w + 400, y + h + 30)) + image.paste((0, 0, 0), box) image.save("./Output/" + file) data = pd.DataFrame({"name": [cnt], "dir": [root + "/" + file]}) data.to_csv("name2file.csv", mode="a", header=False) diff --git a/name2file.csv b/name2file.csv index 14d8ba04aecb154dc996d65f15a0b185df59dc75..515ad78183786a06ed9ca6cd90614f4a0b546fcb 100644 --- a/name2file.csv +++ b/name2file.csv @@ -1,4 +1,11 @@ -,name,dir +0,禹星航,labimage//1.jpg +0,黄梅梅,labimage/42/154045368169150956.jpg +0,何世云,labimage/43/154045384343030182.jpg +0,刘春佛,labimage/44/154045446763420735.jpg +0,,labimage/45/154045530108690559.jpg +0,许赞国,labimage/46/154045562332620522.jpg +0,黄宇公,labimage/47/154045606255320567.jpg +0,禹星航,labimage//1.jpg 0,黄梅梅,labimage/42/154045368169150956.jpg 0,何世云,labimage/43/154045384343030182.jpg 0,刘春佛,labimage/44/154045446763420735.jpg diff --git a/tesseract-ocr/tessdata/chi_sim12.traineddata b/tesseract-ocr/tessdata/chi_sim12.traineddata new file mode 100644 index 0000000000000000000000000000000000000000..da7fa49ded2895ebe974c6ea3a6cc1af8595f1af Binary files /dev/null and b/tesseract-ocr/tessdata/chi_sim12.traineddata differ