diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6bd962b18bd6487894d36a86a4a24f687571d1b9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,75 @@ +# Vue +.DS_Store +node_modules +/dist + +# Eclipse +.classpath +.project +.settings/ + +# IntelliJ IDEA +.idea +*.iml +*.iws + +# Maven +log +target +pom.xml.tag +pom.xml.releaseBackup +pom.xml.versionBackup +pom.xml.next +release.properties +dependency-reduced-pom.xml +buildNumber.properties + +# java +*.class +*.war +*.ear + +# bak +*.bak +/bin/ + +# sbt +/target/ +/project/target/ +/project/project/target/ +/project/project/project/target/ +/build-sbt/ +local.sbt + +# spring +*.springBeans + +# Editor directories and files +.idea +.vscode +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw* + +# local env files +.env.local +.env.*.local + +# Log files +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# python +__pycache__ + +# VSC Counter +/.VSCodeCounter/ + +# data +/data/ + +# cache +/Cache/*.jpg diff --git a/Output/154045368169150956.jpg b/Output/154045368169150956.jpg new file mode 100644 index 0000000000000000000000000000000000000000..06756d3890ff994bd90efa2bf00df483062969f7 Binary files /dev/null and b/Output/154045368169150956.jpg differ diff --git a/Output/154045384343030182.jpg b/Output/154045384343030182.jpg new file mode 100644 index 0000000000000000000000000000000000000000..85480a7783a62357c883e8e6f7a396a906f8693d Binary files /dev/null and b/Output/154045384343030182.jpg differ diff --git a/Output/154045446763420735.jpg b/Output/154045446763420735.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2d9eb0c5d63a710862d4022b42d4a8d93c1cd719 Binary files /dev/null and b/Output/154045446763420735.jpg differ diff --git a/Output/154045530108690559.jpg b/Output/154045530108690559.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6b8dffe7ecc4174a05b7d391614ca61df493dace Binary files /dev/null and b/Output/154045530108690559.jpg differ 
diff --git a/Output/154045562332620522.jpg b/Output/154045562332620522.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f03b1454c07b51b36cc71872a84d30c83736bfb3 Binary files /dev/null and b/Output/154045562332620522.jpg differ diff --git a/Output/154045606255320567.jpg b/Output/154045606255320567.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c16c6fb1d1d26f15a0074a84050ef11546c8c5a8 Binary files /dev/null and b/Output/154045606255320567.jpg differ diff --git a/Output/154045651362010523.jpg b/Output/154045651362010523.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ee8da565c27eb7ca2b7c4731f8172fc3d29aa3e7 Binary files /dev/null and b/Output/154045651362010523.jpg differ diff --git a/Output/154045651362010599.jpg b/Output/154045651362010599.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3a916ab0d105e22f1448c3b0e4e6179e258def98 Binary files /dev/null and b/Output/154045651362010599.jpg differ diff --git a/README.md b/README.md index 09307a4ddbc49d357e9add773be6405b290b1555..f8f48ea3b094b9c6be22d3db2b9588c7796cb6f8 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# OCR 脱敏工具DEMO +# OCR 脱敏工具DEMO \ No newline at end of file diff --git a/labimage/40/153828071104640671.jpg b/labimage/40/153828071104640671.jpg new file mode 100644 index 0000000000000000000000000000000000000000..370c5123d9a9400cfa29718564cdef50193d5bce Binary files /dev/null and b/labimage/40/153828071104640671.jpg differ diff --git a/labimage/41/153828093038760836.jpg b/labimage/41/153828093038760836.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5d5f91c29f89d3e7570b33fef4824278834288a2 Binary files /dev/null and b/labimage/41/153828093038760836.jpg differ diff --git a/labimage/42/154045368169150956.jpg b/labimage/42/154045368169150956.jpg new file mode 100644 index 0000000000000000000000000000000000000000..63a51f248e3a8985d02d16f966504b531fe466a0 Binary files /dev/null and 
b/labimage/42/154045368169150956.jpg differ diff --git a/labimage/43/154045384343030182.jpg b/labimage/43/154045384343030182.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d19eed5afbe9364dac55738e526ee852bd1a7722 Binary files /dev/null and b/labimage/43/154045384343030182.jpg differ diff --git a/labimage/44/154045446763420735.jpg b/labimage/44/154045446763420735.jpg new file mode 100644 index 0000000000000000000000000000000000000000..939c368efa16161f06a6a96978b8710033ebb547 Binary files /dev/null and b/labimage/44/154045446763420735.jpg differ diff --git a/labimage/45/154045530108690559.jpg b/labimage/45/154045530108690559.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7c7515778465d73c8db984fe289e5af84a71246f Binary files /dev/null and b/labimage/45/154045530108690559.jpg differ diff --git a/labimage/46/154045562332620522.jpg b/labimage/46/154045562332620522.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9e42e6e5d5a1f5d898e466f4eda80f76126d470e Binary files /dev/null and b/labimage/46/154045562332620522.jpg differ diff --git a/labimage/47/154045606255320567.jpg b/labimage/47/154045606255320567.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f5e8eae9b476d0f6bcd0759be1ca29d789075546 Binary files /dev/null and b/labimage/47/154045606255320567.jpg differ diff --git a/labimage/48/154045651362010523.jpg b/labimage/48/154045651362010523.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4dc8509d281b5ccaca81c08ec105f1d1a3ab634d Binary files /dev/null and b/labimage/48/154045651362010523.jpg differ diff --git a/labimage/49/154045651362010599.jpg b/labimage/49/154045651362010599.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c81466d556373e43725da0531b159cf4ac13d219 Binary files /dev/null and b/labimage/49/154045651362010599.jpg differ diff --git a/main.py b/main.py new file mode 100644 index 
0000000000000000000000000000000000000000..8756275020246702baeb66e73239646020c6e22e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,102 @@
+import cv2
+import pytesseract
+from PIL import Image
+from PIL import ImageDraw
+from PIL import ImageFont
+from PIL import ImageGrab
+import numpy as np
+import os
+import pandas as pd
+
+
+# Bundled Tesseract binary, relative to the working directory.
+tesseract_cmd = r'.\tesseract-ocr\tesseract.exe'
+# pytesseract only reads pytesseract.pytesseract.tesseract_cmd; a bare
+# module-level name is ignored. Keep the tesseract found on PATH when the
+# bundled binary is not present.
+if os.path.isfile(tesseract_cmd):
+    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
+
+# Language model passed to every OCR call.
+OCR_LANG = "chi_sim43"
+
+
+def find_name_label(words):
+    """Return the index of the first "姓名" label in `words`, or None.
+
+    The label may be recognised as one token ("姓名") or as two
+    consecutive tokens ("姓", "名"); in the latter case the index of "姓"
+    is returned.
+    """
+    for i, word in enumerate(words):
+        if word == "姓名":
+            return i
+        # Bounds check: "姓" may be the very last token.
+        if word == "姓" and i + 1 < len(words) and words[i + 1] == "名":
+            return i
+    return None
+
+
+def read_name(region, file):
+    """OCR the cropped label region and return the text after the label.
+
+    `region` is the PIL crop around the label; `file` is the source file
+    name, reused as the cache file name for the retry.
+    """
+    text = pytesseract.image_to_string(
+        region, lang=OCR_LANG, output_type="dict"
+    )["text"]
+    if text.strip() == "":
+        # Nothing recognised: save the crop and retry on the OpenCV re-read.
+        os.makedirs("./Cache", exist_ok=True)
+        filename = "./Cache/" + file
+        region.save(filename)
+        reread = cv2.imread(filename)
+        if reread is not None:  # cv2.imread returns None on failure
+            text = pytesseract.image_to_string(
+                reread, lang=OCR_LANG, output_type="dict"
+            )["text"]
+    # Drop spaces and the first three characters, then any trailing
+    # newline/form feed left by Tesseract.
+    # NOTE(review): the 3-character prefix is presumably the label plus
+    # its separator -- confirm against real OCR output.
+    return text.replace(" ", "")[3:].strip()
+
+
+def main():
+    """Black out the name field of every image under labimage/.
+
+    For each image where the label is found, the region around it is
+    painted black, the result is saved to ./Output/, and a (name, path)
+    row is appended to name2file.csv. Images without the label are skipped.
+    """
+    os.makedirs("./Output", exist_ok=True)
+    for root, dirs, files in os.walk("labimage/"):
+        for file in files:
+            path = root + "/" + file
+            with Image.open(path) as image:
+                data = pytesseract.image_to_data(
+                    image, lang=OCR_LANG, output_type="dict"
+                )
+                i = find_name_label(data["text"])
+                if i is None:
+                    continue
+                (x, y, w, h) = (
+                    data["left"][i],
+                    data["top"][i],
+                    data["width"][i],
+                    data["height"][i],
+                )
+                print(x, y, w, h)
+                # Label box, enlarged so the crop also holds the name.
+                box = (x - 10, y - 10, x + w + 400, y + h + 30)
+                cnt = read_name(image.crop(box), file)
+                print(cnt)
+                image.paste((0, 0, 0), box)
+                image.save("./Output/" + file)
+            row = pd.DataFrame({'name': [cnt], 'dir': [path]})
+            row.to_csv("name2file.csv", mode='a', header=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/name2file.csv b/name2file.csv
new file mode 100644
index 0000000000000000000000000000000000000000..14d8ba04aecb154dc996d65f15a0b185df59dc75
--- /dev/null
+++ b/name2file.csv
@@ -0,0 +1,9 @@
+,name,dir
+0,黄梅梅,labimage/42/154045368169150956.jpg
+0,何世云,labimage/43/154045384343030182.jpg
+0,刘春佛,labimage/44/154045446763420735.jpg
+0,林美兰,labimage/45/154045530108690559.jpg
+0,许赞国,labimage/46/154045562332620522.jpg
+0,黄守云,labimage/47/154045606255320567.jpg +0,黄秀明,labimage/48/154045651362010523.jpg +0,刘建霞,labimage/49/154045651362010599.jpg diff --git a/tesseract-ocr/ambiguous_words.exe b/tesseract-ocr/ambiguous_words.exe new file mode 100644 index 0000000000000000000000000000000000000000..5ba4705a3a65d777e27155bf9a63425d87c3f7be Binary files /dev/null and b/tesseract-ocr/ambiguous_words.exe differ diff --git a/tesseract-ocr/classifier_tester.exe b/tesseract-ocr/classifier_tester.exe new file mode 100644 index 0000000000000000000000000000000000000000..d3e3cef01cee9feb949ff080d2f341ce1a8fee8f Binary files /dev/null and b/tesseract-ocr/classifier_tester.exe differ diff --git a/tesseract-ocr/cntraining.exe b/tesseract-ocr/cntraining.exe new file mode 100644 index 0000000000000000000000000000000000000000..f91c53e42b86658246b14c084d3499ec1af99f63 Binary files /dev/null and b/tesseract-ocr/cntraining.exe differ diff --git a/tesseract-ocr/combine_lang_model.exe b/tesseract-ocr/combine_lang_model.exe new file mode 100644 index 0000000000000000000000000000000000000000..05bfb8b4695d38d0cc8d462b550540541ff8e611 Binary files /dev/null and b/tesseract-ocr/combine_lang_model.exe differ diff --git a/tesseract-ocr/combine_tessdata.exe b/tesseract-ocr/combine_tessdata.exe new file mode 100644 index 0000000000000000000000000000000000000000..42f6992585980cd240cbe26ecc260749119c264d Binary files /dev/null and b/tesseract-ocr/combine_tessdata.exe differ diff --git a/tesseract-ocr/dawg2wordlist.exe b/tesseract-ocr/dawg2wordlist.exe new file mode 100644 index 0000000000000000000000000000000000000000..8ade731584d55b27d286868d90bfbda19b688f95 Binary files /dev/null and b/tesseract-ocr/dawg2wordlist.exe differ diff --git a/tesseract-ocr/doc/AUTHORS b/tesseract-ocr/doc/AUTHORS new file mode 100644 index 0000000000000000000000000000000000000000..633bf028482fbcd34fde34d8bf798ca4c2a11c1f --- /dev/null +++ b/tesseract-ocr/doc/AUTHORS @@ -0,0 +1,44 @@ +Ray Smith (lead developer) +Ahmad Abdulkader +Rika 
Antonova +Nicholas Beato +Jeff Breidenbach +Samuel Charron +Phil Cheatle +Simon Crouch +David Eger +Sheelagh Huddleston +Dan Johnson +Rajesh Katikam +Thomas Kielbus +Dar-Shyang Lee +Zongyi (Joe) Liu +Robert Moss +Chris Newton +Michael Reimer +Marius Renn +Raquel Romano +Christy Russon +Shobhit Saxena +Mark Seaman +Faisal Shafait +Hiroshi Takenaka +Ranjith Unnikrishnan +Joern Wanke +Ping Ping Xiu +Andrew Ziem +Oscar Zuniga + +Community Contributors: +Zdenko Podobný (Maintainer) +Jim Regan (Maintainer) +James R Barlow +Amit Dovev +Martin Ettl +Shree Devi Kumar +Noah Metzger +Tom Morris +Tobias Müller +Egor Pugin +Sundar M. Vaidya +Stefan Weil diff --git a/tesseract-ocr/doc/COPYING b/tesseract-ocr/doc/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..be5ebaedcf7536337c3dbf71ef33c4c45f860366 --- /dev/null +++ b/tesseract-ocr/doc/COPYING @@ -0,0 +1,21 @@ +This package contains the Tesseract Open Source OCR Engine. +Originally developed at Hewlett Packard Laboratories Bristol and +at Hewlett Packard Co, Greeley Colorado, all the code +in this distribution is now licensed under the Apache License: + +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** http://www.apache.org/licenses/LICENSE-2.0 +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. + + +Other Dependencies and Licenses: +================================ + +Tesseract uses Leptonica library (http://leptonica.com/) which essentially +uses a BSD 2-clause license. 
(http://leptonica.com/about-the-license.html) diff --git a/tesseract-ocr/iconv.dll b/tesseract-ocr/iconv.dll new file mode 100644 index 0000000000000000000000000000000000000000..2f137f4b50188468c821ce859d67d25d2237dc62 Binary files /dev/null and b/tesseract-ocr/iconv.dll differ diff --git a/tesseract-ocr/icudata57.dll b/tesseract-ocr/icudata57.dll new file mode 100644 index 0000000000000000000000000000000000000000..5ddff724642f31e2e36760330566f228cf3b5919 Binary files /dev/null and b/tesseract-ocr/icudata57.dll differ diff --git a/tesseract-ocr/icudt57.dll b/tesseract-ocr/icudt57.dll new file mode 100644 index 0000000000000000000000000000000000000000..ff63fbcd7d76e69d7b4abb5113fe0bfb953a7b20 Binary files /dev/null and b/tesseract-ocr/icudt57.dll differ diff --git a/tesseract-ocr/icui18n57.dll b/tesseract-ocr/icui18n57.dll new file mode 100644 index 0000000000000000000000000000000000000000..574935f64a06c0170f35649c67d4443657f94b40 Binary files /dev/null and b/tesseract-ocr/icui18n57.dll differ diff --git a/tesseract-ocr/icuuc57.dll b/tesseract-ocr/icuuc57.dll new file mode 100644 index 0000000000000000000000000000000000000000..184a6b8a095f2f36114c677b7fe3307b061703cd Binary files /dev/null and b/tesseract-ocr/icuuc57.dll differ diff --git a/tesseract-ocr/java/ScrollView.jar b/tesseract-ocr/java/ScrollView.jar new file mode 100644 index 0000000000000000000000000000000000000000..1663843dd2f1d65b82674acb4ac9c1bf232798a7 Binary files /dev/null and b/tesseract-ocr/java/ScrollView.jar differ diff --git a/tesseract-ocr/java/jaxb-api-2.3.1.jar b/tesseract-ocr/java/jaxb-api-2.3.1.jar new file mode 100644 index 0000000000000000000000000000000000000000..45658654712b88d45c9464286ffc2fcb07036bdf Binary files /dev/null and b/tesseract-ocr/java/jaxb-api-2.3.1.jar differ diff --git a/tesseract-ocr/java/piccolo2d-core-3.0.jar b/tesseract-ocr/java/piccolo2d-core-3.0.jar new file mode 100644 index 0000000000000000000000000000000000000000..1e2cc77a63cf427407f8b8f88683001805510cef 
Binary files /dev/null and b/tesseract-ocr/java/piccolo2d-core-3.0.jar differ diff --git a/tesseract-ocr/java/piccolo2d-extras-3.0.jar b/tesseract-ocr/java/piccolo2d-extras-3.0.jar new file mode 100644 index 0000000000000000000000000000000000000000..074a61f2cddc7c948b4afbecefc41610684c69a8 Binary files /dev/null and b/tesseract-ocr/java/piccolo2d-extras-3.0.jar differ diff --git a/tesseract-ocr/libbz2-1.dll b/tesseract-ocr/libbz2-1.dll new file mode 100644 index 0000000000000000000000000000000000000000..068c2abe19432e22dd5f79e47a0948d78f6186f9 Binary files /dev/null and b/tesseract-ocr/libbz2-1.dll differ diff --git a/tesseract-ocr/libcairo-2.dll b/tesseract-ocr/libcairo-2.dll new file mode 100644 index 0000000000000000000000000000000000000000..4cf4c86ef6e98441ad303646699ceee66177bc10 Binary files /dev/null and b/tesseract-ocr/libcairo-2.dll differ diff --git a/tesseract-ocr/libexpat-1.dll b/tesseract-ocr/libexpat-1.dll new file mode 100644 index 0000000000000000000000000000000000000000..d1ffd53e4758dcdc2e427b191c7b2ea05bfc430f Binary files /dev/null and b/tesseract-ocr/libexpat-1.dll differ diff --git a/tesseract-ocr/libffi-6.dll b/tesseract-ocr/libffi-6.dll new file mode 100644 index 0000000000000000000000000000000000000000..d21fb0d92d0cdc33f717038038bf02f6375af6ff Binary files /dev/null and b/tesseract-ocr/libffi-6.dll differ diff --git a/tesseract-ocr/libfontconfig-1.dll b/tesseract-ocr/libfontconfig-1.dll new file mode 100644 index 0000000000000000000000000000000000000000..f8e0f6dffac3f7211a2a20756e6ba1886288ca1b Binary files /dev/null and b/tesseract-ocr/libfontconfig-1.dll differ diff --git a/tesseract-ocr/libfreetype-6.dll b/tesseract-ocr/libfreetype-6.dll new file mode 100644 index 0000000000000000000000000000000000000000..9f48a1c16501c73ae7685c0a8f93fc031131b290 Binary files /dev/null and b/tesseract-ocr/libfreetype-6.dll differ diff --git a/tesseract-ocr/libgcc_s_seh-1.dll b/tesseract-ocr/libgcc_s_seh-1.dll new file mode 100644 index 
0000000000000000000000000000000000000000..20919f643cbf7e7c707c076ce4d3d8536f6b3a3f Binary files /dev/null and b/tesseract-ocr/libgcc_s_seh-1.dll differ diff --git a/tesseract-ocr/libgcc_s_sjlj-1.dll b/tesseract-ocr/libgcc_s_sjlj-1.dll new file mode 100644 index 0000000000000000000000000000000000000000..718476406533d34632b737dec32b9e2b30dedc50 Binary files /dev/null and b/tesseract-ocr/libgcc_s_sjlj-1.dll differ diff --git a/tesseract-ocr/libgif-7.dll b/tesseract-ocr/libgif-7.dll new file mode 100644 index 0000000000000000000000000000000000000000..a1f890dedf462f699e2f24890bd5169a75b2c5dd Binary files /dev/null and b/tesseract-ocr/libgif-7.dll differ diff --git a/tesseract-ocr/libglib-2.0-0.dll b/tesseract-ocr/libglib-2.0-0.dll new file mode 100644 index 0000000000000000000000000000000000000000..025b18f4c3cbdd951e104f56685b9b1e74fbdc2e Binary files /dev/null and b/tesseract-ocr/libglib-2.0-0.dll differ diff --git a/tesseract-ocr/libgobject-2.0-0.dll b/tesseract-ocr/libgobject-2.0-0.dll new file mode 100644 index 0000000000000000000000000000000000000000..974c519937bc2f55190097bd73001cc809ed3cd3 Binary files /dev/null and b/tesseract-ocr/libgobject-2.0-0.dll differ diff --git a/tesseract-ocr/libgomp-1.dll b/tesseract-ocr/libgomp-1.dll new file mode 100644 index 0000000000000000000000000000000000000000..5005266c788a09cd799e586e6c2b4aed6397990a Binary files /dev/null and b/tesseract-ocr/libgomp-1.dll differ diff --git a/tesseract-ocr/libharfbuzz-0.dll b/tesseract-ocr/libharfbuzz-0.dll new file mode 100644 index 0000000000000000000000000000000000000000..bd2c316b61f0602a2f7f7fe642aa1c393a77dcc1 Binary files /dev/null and b/tesseract-ocr/libharfbuzz-0.dll differ diff --git a/tesseract-ocr/libintl-8.dll b/tesseract-ocr/libintl-8.dll new file mode 100644 index 0000000000000000000000000000000000000000..38cee93f2a9233e51ddc022ab8655522141814b2 Binary files /dev/null and b/tesseract-ocr/libintl-8.dll differ diff --git a/tesseract-ocr/libjbig-2.dll b/tesseract-ocr/libjbig-2.dll 
new file mode 100644 index 0000000000000000000000000000000000000000..7f1504ef21035af17e4d1b6d444542f38e534fc6 Binary files /dev/null and b/tesseract-ocr/libjbig-2.dll differ diff --git a/tesseract-ocr/libjpeg-8.dll b/tesseract-ocr/libjpeg-8.dll new file mode 100644 index 0000000000000000000000000000000000000000..fb78f7d910f06071e68a261dd8fc8022bbcd3b46 Binary files /dev/null and b/tesseract-ocr/libjpeg-8.dll differ diff --git a/tesseract-ocr/liblept-5.dll b/tesseract-ocr/liblept-5.dll new file mode 100644 index 0000000000000000000000000000000000000000..f22cc4581e52c51d5e0fd77bd90706ec51514a1b Binary files /dev/null and b/tesseract-ocr/liblept-5.dll differ diff --git a/tesseract-ocr/liblzma-5.dll b/tesseract-ocr/liblzma-5.dll new file mode 100644 index 0000000000000000000000000000000000000000..8eca0ee96784704acec3f59ea7c5b46add2d5de5 Binary files /dev/null and b/tesseract-ocr/liblzma-5.dll differ diff --git a/tesseract-ocr/libopenjp2.dll b/tesseract-ocr/libopenjp2.dll new file mode 100644 index 0000000000000000000000000000000000000000..27aec5288310127a8ea18ba318db9c0a7f060763 Binary files /dev/null and b/tesseract-ocr/libopenjp2.dll differ diff --git a/tesseract-ocr/libpango-1.0-0.dll b/tesseract-ocr/libpango-1.0-0.dll new file mode 100644 index 0000000000000000000000000000000000000000..787201e2dd2a252610c1e5f5f2e40c2f92bc4ce3 Binary files /dev/null and b/tesseract-ocr/libpango-1.0-0.dll differ diff --git a/tesseract-ocr/libpangocairo-1.0-0.dll b/tesseract-ocr/libpangocairo-1.0-0.dll new file mode 100644 index 0000000000000000000000000000000000000000..674987b5f5e5926c427f72e3b16ec4966a17e50d Binary files /dev/null and b/tesseract-ocr/libpangocairo-1.0-0.dll differ diff --git a/tesseract-ocr/libpangoft2-1.0-0.dll b/tesseract-ocr/libpangoft2-1.0-0.dll new file mode 100644 index 0000000000000000000000000000000000000000..3493c4e4c01f8e30fbc06dd517633a5a815f1fff Binary files /dev/null and b/tesseract-ocr/libpangoft2-1.0-0.dll differ diff --git 
a/tesseract-ocr/libpangowin32-1.0-0.dll b/tesseract-ocr/libpangowin32-1.0-0.dll new file mode 100644 index 0000000000000000000000000000000000000000..bb99d308bf0b3673f593f65860e1072c0f0754c5 Binary files /dev/null and b/tesseract-ocr/libpangowin32-1.0-0.dll differ diff --git a/tesseract-ocr/libpcre-1.dll b/tesseract-ocr/libpcre-1.dll new file mode 100644 index 0000000000000000000000000000000000000000..b5715aeb5a9338fa57872b1d0d777e949e2bfa6f Binary files /dev/null and b/tesseract-ocr/libpcre-1.dll differ diff --git a/tesseract-ocr/libpixman-1-0.dll b/tesseract-ocr/libpixman-1-0.dll new file mode 100644 index 0000000000000000000000000000000000000000..eec2f0aab8b9793b1be12e5457a7fd33da1f60fb Binary files /dev/null and b/tesseract-ocr/libpixman-1-0.dll differ diff --git a/tesseract-ocr/libpng16-16.dll b/tesseract-ocr/libpng16-16.dll new file mode 100644 index 0000000000000000000000000000000000000000..f7682feef664afe38c0a58c448488c58976935ff Binary files /dev/null and b/tesseract-ocr/libpng16-16.dll differ diff --git a/tesseract-ocr/libstdc++-6.dll b/tesseract-ocr/libstdc++-6.dll new file mode 100644 index 0000000000000000000000000000000000000000..00f3184f4e68c25a559e3593844b8883fd492b9d Binary files /dev/null and b/tesseract-ocr/libstdc++-6.dll differ diff --git a/tesseract-ocr/libtesseract-4.dll b/tesseract-ocr/libtesseract-4.dll new file mode 100644 index 0000000000000000000000000000000000000000..20ce636bef35f126248b9eccf64eed1fbcd7f4f4 Binary files /dev/null and b/tesseract-ocr/libtesseract-4.dll differ diff --git a/tesseract-ocr/libtiff-5.dll b/tesseract-ocr/libtiff-5.dll new file mode 100644 index 0000000000000000000000000000000000000000..f142f3f632f1e4ec323df93fa262bb90cb25de5d Binary files /dev/null and b/tesseract-ocr/libtiff-5.dll differ diff --git a/tesseract-ocr/libwebp-7.dll b/tesseract-ocr/libwebp-7.dll new file mode 100644 index 0000000000000000000000000000000000000000..ec8b1f09ee23ae921463b07f187f89eb5a9c5afa Binary files /dev/null and 
b/tesseract-ocr/libwebp-7.dll differ diff --git a/tesseract-ocr/libwinpthread-1.dll b/tesseract-ocr/libwinpthread-1.dll new file mode 100644 index 0000000000000000000000000000000000000000..6138e82b011442bda8e4f0df0aa797a429573a51 Binary files /dev/null and b/tesseract-ocr/libwinpthread-1.dll differ diff --git a/tesseract-ocr/lstmeval.exe b/tesseract-ocr/lstmeval.exe new file mode 100644 index 0000000000000000000000000000000000000000..6cf18345934f00547c29f79807f00984571eda2f Binary files /dev/null and b/tesseract-ocr/lstmeval.exe differ diff --git a/tesseract-ocr/lstmtraining.exe b/tesseract-ocr/lstmtraining.exe new file mode 100644 index 0000000000000000000000000000000000000000..cfbabaa6aa008338972f46c153bbf3d8e0d7521c Binary files /dev/null and b/tesseract-ocr/lstmtraining.exe differ diff --git a/tesseract-ocr/merge_unicharsets.exe b/tesseract-ocr/merge_unicharsets.exe new file mode 100644 index 0000000000000000000000000000000000000000..87d47cea09a4cd0c24c79a2c36d5ce98a84eaae9 Binary files /dev/null and b/tesseract-ocr/merge_unicharsets.exe differ diff --git a/tesseract-ocr/mftraining.exe b/tesseract-ocr/mftraining.exe new file mode 100644 index 0000000000000000000000000000000000000000..e2806b52c284ae68ae53b1a2645d78c5adcdd83f Binary files /dev/null and b/tesseract-ocr/mftraining.exe differ diff --git a/tesseract-ocr/set_unicharset_properties.exe b/tesseract-ocr/set_unicharset_properties.exe new file mode 100644 index 0000000000000000000000000000000000000000..9022e72c68145a8e2151ff5da86add6c65e1c811 Binary files /dev/null and b/tesseract-ocr/set_unicharset_properties.exe differ diff --git a/tesseract-ocr/shapeclustering.exe b/tesseract-ocr/shapeclustering.exe new file mode 100644 index 0000000000000000000000000000000000000000..ad3c760984c866d804fa965ac906b29b6a810d86 Binary files /dev/null and b/tesseract-ocr/shapeclustering.exe differ diff --git a/tesseract-ocr/tessdata/chi_sim43.traineddata b/tesseract-ocr/tessdata/chi_sim43.traineddata new file mode 100644 
index 0000000000000000000000000000000000000000..eeb66cfbd9c02b170a6aeeece673910793d8d8c4 Binary files /dev/null and b/tesseract-ocr/tessdata/chi_sim43.traineddata differ diff --git a/tesseract-ocr/tessdata/configs/ambigs.train b/tesseract-ocr/tessdata/configs/ambigs.train new file mode 100644 index 0000000000000000000000000000000000000000..23035a1904cfb8a2e5ad143ac638447bc1b04b4c --- /dev/null +++ b/tesseract-ocr/tessdata/configs/ambigs.train @@ -0,0 +1,7 @@ +tessedit_ambigs_training 1 +load_freq_dawg 0 +load_punc_dawg 0 +load_system_dawg 0 +load_number_dawg 0 +ambigs_debug_level 3 +load_fixed_length_dawgs 0 diff --git a/tesseract-ocr/tessdata/configs/api_config b/tesseract-ocr/tessdata/configs/api_config new file mode 100644 index 0000000000000000000000000000000000000000..5cd6ec0310213adbc59e5c48a49f858daf3cdc4f --- /dev/null +++ b/tesseract-ocr/tessdata/configs/api_config @@ -0,0 +1 @@ +tessedit_zero_rejection T diff --git a/tesseract-ocr/tessdata/configs/bigram b/tesseract-ocr/tessdata/configs/bigram new file mode 100644 index 0000000000000000000000000000000000000000..5d6c2d061f4a0bae8ab3b2270da8e6744a048d11 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/bigram @@ -0,0 +1,5 @@ +load_bigram_dawg True +tessedit_enable_bigram_correction True +tessedit_bigram_debug 3 +save_raw_choices True +save_alt_choices True diff --git a/tesseract-ocr/tessdata/configs/box.train b/tesseract-ocr/tessdata/configs/box.train new file mode 100644 index 0000000000000000000000000000000000000000..03cf5632695c4d6262e5646f998d43c0b676dcd5 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/box.train @@ -0,0 +1,14 @@ +disable_character_fragments T +file_type .bl +textord_fast_pitch_test T +tessedit_single_match 0 +tessedit_zero_rejection T +tessedit_minimal_rejection F +tessedit_write_rep_codes F +il1_adaption_test 1 +edges_children_fix F +edges_childarea 0.65 +edges_boxarea 0.9 +tessedit_resegment_from_boxes T +tessedit_train_from_boxes T +textord_no_rejects T diff --git 
a/tesseract-ocr/tessdata/configs/box.train.stderr b/tesseract-ocr/tessdata/configs/box.train.stderr new file mode 100644 index 0000000000000000000000000000000000000000..d44ff2b2c76c3590a77af44076b04e9edbd4ae39 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/box.train.stderr @@ -0,0 +1,15 @@ +file_type .bl +#tessedit_use_nn F +textord_fast_pitch_test T +tessedit_single_match 0 +tessedit_zero_rejection T +tessedit_minimal_rejection F +tessedit_write_rep_codes F +il1_adaption_test 1 +edges_children_fix F +edges_childarea 0.65 +edges_boxarea 0.9 +tessedit_resegment_from_boxes T +tessedit_train_from_boxes T +#textord_repeat_extraction F +textord_no_rejects T diff --git a/tesseract-ocr/tessdata/configs/digits b/tesseract-ocr/tessdata/configs/digits new file mode 100644 index 0000000000000000000000000000000000000000..6a329f892910ae9dd7af1f9fe8f7a1d48378fd8b --- /dev/null +++ b/tesseract-ocr/tessdata/configs/digits @@ -0,0 +1 @@ +tessedit_char_whitelist 0123456789-. diff --git a/tesseract-ocr/tessdata/configs/hocr b/tesseract-ocr/tessdata/configs/hocr new file mode 100644 index 0000000000000000000000000000000000000000..5ab372eaf819b05bdd87ba419c874f6a1be4677b --- /dev/null +++ b/tesseract-ocr/tessdata/configs/hocr @@ -0,0 +1,2 @@ +tessedit_create_hocr 1 +hocr_font_info 0 diff --git a/tesseract-ocr/tessdata/configs/inter b/tesseract-ocr/tessdata/configs/inter new file mode 100644 index 0000000000000000000000000000000000000000..252f1a171a154f9ade798e210015a720af039d00 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/inter @@ -0,0 +1,2 @@ +interactive_display_mode T +tessedit_display_outwords T diff --git a/tesseract-ocr/tessdata/configs/kannada b/tesseract-ocr/tessdata/configs/kannada new file mode 100644 index 0000000000000000000000000000000000000000..c6ac105788137bc4e89821e94843ea86ed5b4564 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/kannada @@ -0,0 +1,4 @@ +textord_skewsmooth_offset 8 +textord_skewsmooth_offset2 8 +textord_merge_desc 0.5 +textord_no_rejects 1 
diff --git a/tesseract-ocr/tessdata/configs/linebox b/tesseract-ocr/tessdata/configs/linebox new file mode 100644 index 0000000000000000000000000000000000000000..bd9c114df65ddf13e640298075adb940225c5f96 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/linebox @@ -0,0 +1,2 @@ +tessedit_resegment_from_line_boxes 1 +tessedit_make_boxes_from_boxes 1 diff --git a/tesseract-ocr/tessdata/configs/logfile b/tesseract-ocr/tessdata/configs/logfile new file mode 100644 index 0000000000000000000000000000000000000000..a160f9be275a70fe3af1935fb8fe7af29efa8451 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/logfile @@ -0,0 +1 @@ +debug_file tesseract.log diff --git a/tesseract-ocr/tessdata/configs/lstm.train b/tesseract-ocr/tessdata/configs/lstm.train new file mode 100644 index 0000000000000000000000000000000000000000..3cb172d5e4095a08f3ce1482feb392efe85c94ab --- /dev/null +++ b/tesseract-ocr/tessdata/configs/lstm.train @@ -0,0 +1,13 @@ +disable_character_fragments T +file_type .bl +textord_fast_pitch_test T +tessedit_single_match 0 +tessedit_zero_rejection T +tessedit_minimal_rejection F +tessedit_write_rep_codes F +il1_adaption_test 1 +edges_children_fix F +edges_childarea 0.65 +edges_boxarea 0.9 +tessedit_train_line_recognizer T +textord_no_rejects T diff --git a/tesseract-ocr/tessdata/configs/lstmdebug b/tesseract-ocr/tessdata/configs/lstmdebug new file mode 100644 index 0000000000000000000000000000000000000000..3fa3dee71aafe30913c1863a5e67529872984743 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/lstmdebug @@ -0,0 +1,4 @@ +stopper_debug_level 1 +classify_debug_level 1 +segsearch_debug_level 1 +language_model_debug_level 3 diff --git a/tesseract-ocr/tessdata/configs/makebox b/tesseract-ocr/tessdata/configs/makebox new file mode 100644 index 0000000000000000000000000000000000000000..3d90ac26f9542c6beac1082b2d900859906af8e9 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/makebox @@ -0,0 +1 @@ +tessedit_create_boxfile 1 diff --git a/tesseract-ocr/tessdata/configs/pdf 
b/tesseract-ocr/tessdata/configs/pdf new file mode 100644 index 0000000000000000000000000000000000000000..59645d71ce52a143d819f2057c8c4e9ce2d46e40 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/pdf @@ -0,0 +1 @@ +tessedit_create_pdf 1 diff --git a/tesseract-ocr/tessdata/configs/quiet b/tesseract-ocr/tessdata/configs/quiet new file mode 100644 index 0000000000000000000000000000000000000000..35b59a9d41dd462c6d13b2301d4b2c31219c582f --- /dev/null +++ b/tesseract-ocr/tessdata/configs/quiet @@ -0,0 +1 @@ +debug_file /dev/null diff --git a/tesseract-ocr/tessdata/configs/rebox b/tesseract-ocr/tessdata/configs/rebox new file mode 100644 index 0000000000000000000000000000000000000000..f8342b4c2c7eb733e1d4078f32a0aa5aee677cc3 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/rebox @@ -0,0 +1,2 @@ +tessedit_resegment_from_boxes 1 +tessedit_make_boxes_from_boxes 1 diff --git a/tesseract-ocr/tessdata/configs/strokewidth b/tesseract-ocr/tessdata/configs/strokewidth new file mode 100644 index 0000000000000000000000000000000000000000..e95b59263daf6d43f1b20682a4fa79d386484536 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/strokewidth @@ -0,0 +1,12 @@ +textord_show_blobs 0 +textord_debug_tabfind 3 +textord_tabfind_show_partitions 1 +textord_tabfind_show_initial_partitions 1 +textord_tabfind_show_columns 1 +textord_tabfind_show_blocks 1 +textord_tabfind_show_initialtabs 1 +textord_tabfind_show_finaltabs 1 +textord_tabfind_show_strokewidths 1 +textord_tabfind_show_vlines 0 +textord_tabfind_show_images 1 +tessedit_dump_pageseg_images 0 diff --git a/tesseract-ocr/tessdata/configs/tsv b/tesseract-ocr/tessdata/configs/tsv new file mode 100644 index 0000000000000000000000000000000000000000..dc52478177fd6fb7b1fe278e1374c2054f3e2442 --- /dev/null +++ b/tesseract-ocr/tessdata/configs/tsv @@ -0,0 +1 @@ +tessedit_create_tsv 1 diff --git a/tesseract-ocr/tessdata/configs/txt b/tesseract-ocr/tessdata/configs/txt new file mode 100644 index 
0000000000000000000000000000000000000000..5046f0b045fbd25f6fd63568e6514eb1c04fe05d --- /dev/null +++ b/tesseract-ocr/tessdata/configs/txt @@ -0,0 +1,3 @@ +# This config file should be used with other cofig files which creates renderers. +# usage example: tesseract eurotext.tif eurotext txt hocr pdf +tessedit_create_txt 1 diff --git a/tesseract-ocr/tessdata/configs/unlv b/tesseract-ocr/tessdata/configs/unlv new file mode 100644 index 0000000000000000000000000000000000000000..d2e22f5b93585032eef94f22966329144ba44d6f --- /dev/null +++ b/tesseract-ocr/tessdata/configs/unlv @@ -0,0 +1,2 @@ +tessedit_write_unlv 1 +unlv_tilde_crunching T diff --git a/tesseract-ocr/tessdata/eng.traineddata b/tesseract-ocr/tessdata/eng.traineddata new file mode 100644 index 0000000000000000000000000000000000000000..bbef4675053b5b468cdb477053e28b1c698ba08e Binary files /dev/null and b/tesseract-ocr/tessdata/eng.traineddata differ diff --git a/tesseract-ocr/tessdata/eng.user-patterns b/tesseract-ocr/tessdata/eng.user-patterns new file mode 100644 index 0000000000000000000000000000000000000000..5daba44df897f1c6d67caeb6d0414f7b55625fa1 --- /dev/null +++ b/tesseract-ocr/tessdata/eng.user-patterns @@ -0,0 +1,2 @@ +1-\d\d\d-GOOG-411 +www.\n\\\*.com diff --git a/tesseract-ocr/tessdata/eng.user-words b/tesseract-ocr/tessdata/eng.user-words new file mode 100644 index 0000000000000000000000000000000000000000..e0c5a630214ac69273e2b54107c62ed171fc50a0 --- /dev/null +++ b/tesseract-ocr/tessdata/eng.user-words @@ -0,0 +1,5 @@ +the +quick +brown +fox +jumped diff --git a/tesseract-ocr/tessdata/osd.traineddata b/tesseract-ocr/tessdata/osd.traineddata new file mode 100644 index 0000000000000000000000000000000000000000..527457ca8f8fe1fda7c2f88bce3c0e4be12be9d0 Binary files /dev/null and b/tesseract-ocr/tessdata/osd.traineddata differ diff --git a/tesseract-ocr/tessdata/pdf.ttf b/tesseract-ocr/tessdata/pdf.ttf new file mode 100644 index 
0000000000000000000000000000000000000000..d1472b20ef1aebbf5e11573867e9ac13873681b9 Binary files /dev/null and b/tesseract-ocr/tessdata/pdf.ttf differ diff --git a/tesseract-ocr/tessdata/tessconfigs/batch b/tesseract-ocr/tessdata/tessconfigs/batch new file mode 100644 index 0000000000000000000000000000000000000000..a681e4a443fa21ce6f32bbcf0334af3433888566 --- /dev/null +++ b/tesseract-ocr/tessdata/tessconfigs/batch @@ -0,0 +1 @@ +# No content needed as all defaults are correct. diff --git a/tesseract-ocr/tessdata/tessconfigs/batch.nochop b/tesseract-ocr/tessdata/tessconfigs/batch.nochop new file mode 100644 index 0000000000000000000000000000000000000000..ebaab9438e309b4dfdfd8428676170ab2b64a858 --- /dev/null +++ b/tesseract-ocr/tessdata/tessconfigs/batch.nochop @@ -0,0 +1,2 @@ +chop_enable 0 +wordrec_enable_assoc 0 diff --git a/tesseract-ocr/tessdata/tessconfigs/matdemo b/tesseract-ocr/tessdata/tessconfigs/matdemo new file mode 100644 index 0000000000000000000000000000000000000000..c34567be7565d519806076b795fceff9fdad1477 --- /dev/null +++ b/tesseract-ocr/tessdata/tessconfigs/matdemo @@ -0,0 +1,7 @@ +################################################# +# Adaptive Matcher Using PreAdapted Templates +################################################# + +classify_enable_adaptive_debugger 1 +matcher_debug_flags 6 +matcher_debug_level 1 diff --git a/tesseract-ocr/tessdata/tessconfigs/msdemo b/tesseract-ocr/tessdata/tessconfigs/msdemo new file mode 100644 index 0000000000000000000000000000000000000000..a1af21fe61d03ffbac4915f695d9d9ad6e580ec0 --- /dev/null +++ b/tesseract-ocr/tessdata/tessconfigs/msdemo @@ -0,0 +1,13 @@ +################################################# +# Adaptive Matcher Using PreAdapted Templates +################################################# + +classify_enable_adaptive_debugger 1 +matcher_debug_flags 6 +matcher_debug_level 1 + +wordrec_display_splits 0 +wordrec_display_all_words 1 +wordrec_display_all_blobs 1 +wordrec_display_segmentations 2 
+classify_debug_level 1 diff --git a/tesseract-ocr/tessdata/tessconfigs/nobatch b/tesseract-ocr/tessdata/tessconfigs/nobatch new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/tesseract-ocr/tessdata/tessconfigs/nobatch @@ -0,0 +1 @@ + diff --git a/tesseract-ocr/tessdata/tessconfigs/segdemo b/tesseract-ocr/tessdata/tessconfigs/segdemo new file mode 100644 index 0000000000000000000000000000000000000000..d7d90ae690542f6f9d9221043773cdb7165821a4 --- /dev/null +++ b/tesseract-ocr/tessdata/tessconfigs/segdemo @@ -0,0 +1,10 @@ +################################################# +# Adaptive Matcher Using PreAdapted Templates +################################################# + +wordrec_display_splits 0 +wordrec_display_all_words 1 +wordrec_display_all_blobs 1 +wordrec_display_segmentations 2 +classify_debug_level 1 +stopper_debug_level 1 diff --git a/tesseract-ocr/tesseract.exe b/tesseract-ocr/tesseract.exe new file mode 100644 index 0000000000000000000000000000000000000000..87e411995093865d270b3b699195a97465dfc27e Binary files /dev/null and b/tesseract-ocr/tesseract.exe differ diff --git a/tesseract-ocr/text2image.exe b/tesseract-ocr/text2image.exe new file mode 100644 index 0000000000000000000000000000000000000000..23dce4632b120ebb414d392ceb6f9e109a12d919 Binary files /dev/null and b/tesseract-ocr/text2image.exe differ diff --git a/tesseract-ocr/unicharset_extractor.exe b/tesseract-ocr/unicharset_extractor.exe new file mode 100644 index 0000000000000000000000000000000000000000..c6d62496446439c1cd140c169c88485ba97a6992 Binary files /dev/null and b/tesseract-ocr/unicharset_extractor.exe differ diff --git a/tesseract-ocr/wordlist2dawg.exe b/tesseract-ocr/wordlist2dawg.exe new file mode 100644 index 0000000000000000000000000000000000000000..dd63516ab8c6fe0b28f76ae8b2af568ad0dd78b2 Binary files /dev/null and b/tesseract-ocr/wordlist2dawg.exe differ diff --git a/tesseract-ocr/zlib1.dll 
b/tesseract-ocr/zlib1.dll new file mode 100644 index 0000000000000000000000000000000000000000..39d1e75dcd9c90fa2db379b635cff50eec0cdac2 Binary files /dev/null and b/tesseract-ocr/zlib1.dll differ