MikeTrizna commited on
Commit
f3270e6
·
verified ·
1 Parent(s): f2d0191

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +9 -10
  2. README.md +41 -13
  3. app.py +126 -0
  4. backend/__pycache__/pytorch.cpython-312.pyc +0 -0
  5. backend/pytorch.py +97 -0
  6. backend/tensorflow.py +99 -0
  7. packages.txt +2 -0
  8. requirements.txt +2 -3
  9. src/python-doctr/.github/ISSUE_TEMPLATE/bug_report.yml +63 -0
  10. src/python-doctr/.github/ISSUE_TEMPLATE/config.yml +5 -0
  11. src/python-doctr/.github/ISSUE_TEMPLATE/feature_request.yml +33 -0
  12. src/python-doctr/.github/dependabot.yml +30 -0
  13. src/python-doctr/.github/release.yml +24 -0
  14. src/python-doctr/.github/verify_pr_labels.py +87 -0
  15. src/python-doctr/.github/workflows/builds.yml +43 -0
  16. src/python-doctr/.github/workflows/clear_caches.yml +15 -0
  17. src/python-doctr/.github/workflows/demo.yml +89 -0
  18. src/python-doctr/.github/workflows/doc-status.yml +22 -0
  19. src/python-doctr/.github/workflows/docker.yml +36 -0
  20. src/python-doctr/.github/workflows/docs.yml +51 -0
  21. src/python-doctr/.github/workflows/main.yml +90 -0
  22. src/python-doctr/.github/workflows/pr-labels.yml +29 -0
  23. src/python-doctr/.github/workflows/public_docker_images.yml +91 -0
  24. src/python-doctr/.github/workflows/publish.yml +65 -0
  25. src/python-doctr/.github/workflows/pull_requests.yml +32 -0
  26. src/python-doctr/.github/workflows/references.yml +253 -0
  27. src/python-doctr/.github/workflows/scripts.yml +121 -0
  28. src/python-doctr/.github/workflows/style.yml +55 -0
  29. src/python-doctr/.gitignore +140 -0
  30. src/python-doctr/.pre-commit-config.yaml +23 -0
  31. src/python-doctr/CODE_OF_CONDUCT.md +128 -0
  32. src/python-doctr/CONTRIBUTING.md +92 -0
  33. src/python-doctr/Dockerfile +46 -0
  34. src/python-doctr/LICENSE +201 -0
  35. src/python-doctr/Makefile +29 -0
  36. src/python-doctr/README.md +349 -0
  37. src/python-doctr/api/.gitignore +2 -0
  38. src/python-doctr/api/Dockerfile +25 -0
  39. src/python-doctr/api/Makefile +26 -0
  40. src/python-doctr/api/README.md +194 -0
  41. src/python-doctr/api/app/config.py +13 -0
  42. src/python-doctr/api/app/main.py +48 -0
  43. src/python-doctr/api/app/routes/detection.py +35 -0
  44. src/python-doctr/api/app/routes/kie.py +53 -0
  45. src/python-doctr/api/app/routes/ocr.py +66 -0
  46. src/python-doctr/api/app/routes/recognition.py +29 -0
  47. src/python-doctr/api/app/schemas.py +186 -0
  48. src/python-doctr/api/app/utils.py +47 -0
  49. src/python-doctr/api/app/vision.py +53 -0
  50. src/python-doctr/api/docker-compose.yml +9 -0
.gitattributes CHANGED
@@ -1,35 +1,34 @@
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
  *.bz2 filter=lfs diff=lfs merge=lfs -text
 
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
 
11
  *.model filter=lfs diff=lfs merge=lfs -text
12
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
13
  *.onnx filter=lfs diff=lfs merge=lfs -text
14
  *.ot filter=lfs diff=lfs merge=lfs -text
15
  *.parquet filter=lfs diff=lfs merge=lfs -text
16
  *.pb filter=lfs diff=lfs merge=lfs -text
 
 
17
  *.pt filter=lfs diff=lfs merge=lfs -text
18
  *.pth filter=lfs diff=lfs merge=lfs -text
19
  *.rar filter=lfs diff=lfs merge=lfs -text
 
20
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
22
  *.tflite filter=lfs diff=lfs merge=lfs -text
23
  *.tgz filter=lfs diff=lfs merge=lfs -text
 
24
  *.xz filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ src/python-doctr/docs/images/demo_illustration_mini.png filter=lfs diff=lfs merge=lfs -text
29
+ src/python-doctr/docs/images/demo_update.png filter=lfs diff=lfs merge=lfs -text
30
+ src/python-doctr/docs/images/doctr-need-help.png filter=lfs diff=lfs merge=lfs -text
31
+ src/python-doctr/docs/images/doctr_demo_app.png filter=lfs diff=lfs merge=lfs -text
32
+ src/python-doctr/docs/images/ocr.png filter=lfs diff=lfs merge=lfs -text
33
+ src/python-doctr/docs/source/_static/images/favicon.ico filter=lfs diff=lfs merge=lfs -text
34
+ src/python-doctr/doctr/datasets/__pycache__/vocabs.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,19 +1,47 @@
1
  ---
2
- title: Doctr Demo Fork
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
  pinned: false
11
- short_description: A fork of https://huggingface.co/spaces/mindee/doctr
12
  ---
13
 
14
- # Welcome to Streamlit!
15
 
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
 
17
 
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: docTR
3
+ emoji: 📑
4
+ colorFrom: purple
5
+ colorTo: pink
6
+ sdk: streamlit
7
+ sdk_version: 1.39.0
8
+ app_file: app.py
 
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
+ ## Configuration
14
 
15
+ `title`: _string_
16
+ Display title for the Space
17
 
18
+ `emoji`: _string_
19
+ Space emoji (emoji-only character allowed)
20
+
21
+ `colorFrom`: _string_
22
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
23
+
24
+ `colorTo`: _string_
25
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
26
+
27
+ `sdk`: _string_
28
+ Can be either `gradio` or `streamlit`
29
+
30
+ `sdk_version` : _string_
31
+ Only applicable for `streamlit` SDK.
32
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
33
+
34
+ `app_file`: _string_
35
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code).
36
+ Path is relative to the root of the repository.
37
+
38
+ `pinned`: _boolean_
39
+ Whether the Space stays on top of your list.
40
+
41
+ ## Run the demo locally
42
+
43
+ ```bash
44
+ cd demo
45
+ pip install -r pt-requirements.txt
46
+ streamlit run app.py
47
+ ```
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2025, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import cv2
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import streamlit as st
10
+ import torch
11
+ from backend.pytorch import DET_ARCHS, RECO_ARCHS, forward_image, load_predictor
12
+
13
+ from doctr.io import DocumentFile
14
+ from doctr.utils.visualization import visualize_page
15
+
16
# Module-level device selection: prefer the first CUDA GPU when one is
# available, otherwise fall back to CPU. Shared by all forward passes below.
forward_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
17
+
18
+
19
def main(det_archs, reco_archs):
    """Build the Streamlit layout for the docTR demo.

    Renders a sidebar (document upload, model selection, inference
    parameters) and a four-column results area (input page, segmentation
    heatmap, OCR output, page reconstitution).

    Args:
        det_archs: list of text-detection architecture names to offer
        reco_archs: list of text-recognition architecture names to offer
    """
    # Wide mode
    st.set_page_config(layout="wide")

    # Designing the interface
    st.title("docTR: Document Text Recognition")
    # For newline
    st.write("\n")
    # Instructions
    st.markdown("*Hint: click on the top-right corner of an image to enlarge it!*")
    # Set the columns: input | heatmap | OCR overlay | synthesized page
    cols = st.columns((1, 1, 1, 1))
    cols[0].subheader("Input page")
    cols[1].subheader("Segmentation heatmap")
    cols[2].subheader("OCR output")
    cols[3].subheader("Page reconstitution")

    # Sidebar
    # File selection
    st.sidebar.title("Document selection")
    # Choose your own image
    uploaded_file = st.sidebar.file_uploader("Upload files", type=["pdf", "png", "jpeg", "jpg"])
    if uploaded_file is not None:
        # PDFs may hold several pages; images yield a single-page document
        if uploaded_file.name.endswith(".pdf"):
            doc = DocumentFile.from_pdf(uploaded_file.read())
        else:
            doc = DocumentFile.from_images(uploaded_file.read())
        # Selectbox is 1-based for display; convert back to a 0-based index
        page_idx = st.sidebar.selectbox("Page selection", [idx + 1 for idx in range(len(doc))]) - 1
        page = doc[page_idx]
        cols[0].image(page)

    # Model selection
    st.sidebar.title("Model selection")
    st.sidebar.markdown("**Backend**: PyTorch")
    det_arch = st.sidebar.selectbox("Text detection model", det_archs)
    reco_arch = st.sidebar.selectbox("Text recognition model", reco_archs)

    # For newline
    st.sidebar.write("\n")
    # Only straight pages or possible rotation
    st.sidebar.title("Parameters")
    assume_straight_pages = st.sidebar.checkbox("Assume straight pages", value=True)
    # Disable page orientation detection
    disable_page_orientation = st.sidebar.checkbox("Disable page orientation detection", value=False)
    # Disable crop orientation detection
    disable_crop_orientation = st.sidebar.checkbox("Disable crop orientation detection", value=False)
    # Straighten pages
    straighten_pages = st.sidebar.checkbox("Straighten pages", value=False)
    # Export as straight boxes
    export_straight_boxes = st.sidebar.checkbox("Export as straight boxes", value=False)
    st.sidebar.write("\n")
    # Binarization threshold applied to the detection segmentation map
    bin_thresh = st.sidebar.slider("Binarization threshold", min_value=0.1, max_value=0.9, value=0.3, step=0.1)
    st.sidebar.write("\n")
    # Box threshold: minimal objectness score to keep a detected box
    box_thresh = st.sidebar.slider("Box threshold", min_value=0.1, max_value=0.9, value=0.1, step=0.1)
    st.sidebar.write("\n")

    if st.sidebar.button("Analyze page"):
        if uploaded_file is None:
            st.sidebar.write("Please upload a document")

        else:
            with st.spinner("Loading model..."):
                predictor = load_predictor(
                    det_arch=det_arch,
                    reco_arch=reco_arch,
                    assume_straight_pages=assume_straight_pages,
                    straighten_pages=straighten_pages,
                    export_as_straight_boxes=export_straight_boxes,
                    disable_page_orientation=disable_page_orientation,
                    disable_crop_orientation=disable_crop_orientation,
                    bin_thresh=bin_thresh,
                    box_thresh=box_thresh,
                    device=forward_device,
                )

            with st.spinner("Analyzing..."):
                # Forward the image to the model (detection branch only)
                seg_map = forward_image(predictor, page, forward_device)
                seg_map = np.squeeze(seg_map)
                # Upscale the heatmap back to the original page size
                seg_map = cv2.resize(seg_map, (page.shape[1], page.shape[0]), interpolation=cv2.INTER_LINEAR)

                # Plot the raw heatmap
                fig, ax = plt.subplots()
                ax.imshow(seg_map)
                ax.axis("off")
                cols[1].pyplot(fig)

                # Plot OCR output (full end-to-end prediction)
                out = predictor([page])
                fig = visualize_page(out.pages[0].export(), out.pages[0].page, interactive=False, add_labels=False)
                cols[2].pyplot(fig)

                # Page reconstitution under input page; synthesis only works
                # when boxes end up axis-aligned (straight pages or straightened)
                page_export = out.pages[0].export()
                if assume_straight_pages or (not assume_straight_pages and straighten_pages):
                    img = out.pages[0].synthesize()
                    cols[3].image(img, clamp=True)

                # Display JSON
                st.markdown("\nHere are your analysis results in JSON format:")
                st.json(page_export, expanded=False)


if __name__ == "__main__":
    main(DET_ARCHS, RECO_ARCHS)
backend/__pycache__/pytorch.cpython-312.pyc ADDED
Binary file (3.46 kB). View file
 
backend/pytorch.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2025, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+ from doctr.models import ocr_predictor
10
+ from doctr.models.predictor import OCRPredictor
11
+
12
+ DET_ARCHS = [
13
+ "fast_base",
14
+ "fast_small",
15
+ "fast_tiny",
16
+ "db_resnet50",
17
+ "db_resnet34",
18
+ "db_mobilenet_v3_large",
19
+ "linknet_resnet18",
20
+ "linknet_resnet34",
21
+ "linknet_resnet50",
22
+ ]
23
+ RECO_ARCHS = [
24
+ "crnn_vgg16_bn",
25
+ "crnn_mobilenet_v3_small",
26
+ "crnn_mobilenet_v3_large",
27
+ "master",
28
+ "sar_resnet31",
29
+ "vitstr_small",
30
+ "vitstr_base",
31
+ "parseq",
32
+ "viptr_tiny",
33
+ ]
34
+
35
+
36
def load_predictor(
    det_arch: str,
    reco_arch: str,
    assume_straight_pages: bool,
    straighten_pages: bool,
    export_as_straight_boxes: bool,
    disable_page_orientation: bool,
    disable_crop_orientation: bool,
    bin_thresh: float,
    box_thresh: float,
    device: torch.device,
) -> OCRPredictor:
    """Load a pretrained OCR predictor from doctr.models

    Args:
        det_arch: detection architecture
        reco_arch: recognition architecture
        assume_straight_pages: whether to assume straight pages or not
        straighten_pages: whether to straighten rotated pages or not
        export_as_straight_boxes: whether to export boxes as straight or not
        disable_page_orientation: whether to disable page orientation or not
        disable_crop_orientation: whether to disable crop orientation or not
        bin_thresh: binarization threshold for the segmentation map
        box_thresh: minimal objectness score to consider a box
        device: torch.device, the device to load the predictor on

    Returns:
        instance of OCRPredictor
    """
    # Orientation detection is only worthwhile when pages may be rotated
    detect_orientation = not assume_straight_pages
    predictor = ocr_predictor(
        det_arch,
        reco_arch,
        pretrained=True,
        assume_straight_pages=assume_straight_pages,
        straighten_pages=straighten_pages,
        export_as_straight_boxes=export_as_straight_boxes,
        detect_orientation=detect_orientation,
        disable_page_orientation=disable_page_orientation,
        disable_crop_orientation=disable_crop_orientation,
    ).to(device)
    # Tune the detection post-processing thresholds in place
    postprocessor = predictor.det_predictor.model.postprocessor
    postprocessor.bin_thresh = bin_thresh
    postprocessor.box_thresh = box_thresh
    return predictor
+
80
+
81
def forward_image(predictor: OCRPredictor, image: np.ndarray, device: torch.device) -> np.ndarray:
    """Run only the detection branch of the predictor on a single image

    Args:
        predictor: instance of OCRPredictor
        image: image to process
        device: torch.device, the device to process the image on

    Returns:
        segmentation map
    """
    det_predictor = predictor.det_predictor
    # Gradients are never needed at inference time
    with torch.no_grad():
        batch = det_predictor.pre_processor([image])[0]
        model_out = det_predictor.model(batch.to(device), return_model_output=True)
        seg_map = model_out["out_map"].to("cpu").numpy()

    return seg_map
backend/tensorflow.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2025, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import numpy as np
7
+ import tensorflow as tf
8
+
9
+ from doctr.models import ocr_predictor
10
+ from doctr.models.predictor import OCRPredictor
11
+
12
+ DET_ARCHS = [
13
+ "fast_base",
14
+ "fast_small",
15
+ "fast_tiny",
16
+ "db_resnet50",
17
+ "db_mobilenet_v3_large",
18
+ "linknet_resnet18",
19
+ "linknet_resnet34",
20
+ "linknet_resnet50",
21
+ ]
22
+ RECO_ARCHS = [
23
+ "crnn_vgg16_bn",
24
+ "crnn_mobilenet_v3_small",
25
+ "crnn_mobilenet_v3_large",
26
+ "master",
27
+ "sar_resnet31",
28
+ "vitstr_small",
29
+ "vitstr_base",
30
+ "parseq",
31
+ ]
32
+
33
+
34
def load_predictor(
    det_arch: str,
    reco_arch: str,
    assume_straight_pages: bool,
    straighten_pages: bool,
    export_as_straight_boxes: bool,
    disable_page_orientation: bool,
    disable_crop_orientation: bool,
    bin_thresh: float,
    box_thresh: float,
    device: tf.device,
) -> OCRPredictor:
    """Load a pretrained OCR predictor from doctr.models

    Args:
        det_arch: detection architecture
        reco_arch: recognition architecture
        assume_straight_pages: whether to assume straight pages or not
        straighten_pages: whether to straighten rotated pages or not
        export_as_straight_boxes: whether to export boxes as straight or not
        disable_page_orientation: whether to disable page orientation or not
        disable_crop_orientation: whether to disable crop orientation or not
        bin_thresh: binarization threshold for the segmentation map
        box_thresh: threshold for the detection boxes
        device: tf.device, the device to load the predictor on

    Returns:
        instance of OCRPredictor
    """
    # Build the predictor inside the device scope so its weights live there
    with device:
        predictor = ocr_predictor(
            det_arch,
            reco_arch,
            pretrained=True,
            assume_straight_pages=assume_straight_pages,
            straighten_pages=straighten_pages,
            export_as_straight_boxes=export_as_straight_boxes,
            detect_orientation=not assume_straight_pages,
            disable_page_orientation=disable_page_orientation,
            disable_crop_orientation=disable_crop_orientation,
        )
        # Override the detection post-processing thresholds in place
        postprocessor = predictor.det_predictor.model.postprocessor
        postprocessor.bin_thresh = bin_thresh
        postprocessor.box_thresh = box_thresh
    return predictor
78
+
79
+
80
def forward_image(predictor: OCRPredictor, image: np.ndarray, device: tf.device) -> np.ndarray:
    """Run only the detection branch of the predictor on a single image

    Args:
        predictor: instance of OCRPredictor
        image: image to process as numpy array
        device: tf.device, the device to process the image on

    Returns:
        segmentation map
    """
    det_predictor = predictor.det_predictor
    # Run the detection model on the requested device...
    with device:
        batches = det_predictor.pre_processor([image])
        seg_map = det_predictor.model(batches[0], return_model_output=True)["out_map"]

    # ...then materialize the result on CPU as a numpy array
    with tf.device("/cpu:0"):
        seg_map = tf.identity(seg_map).numpy()

    return seg_map
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ python3-opencv
2
+ fonts-freefont-ttf
requirements.txt CHANGED
@@ -1,3 +1,2 @@
1
- altair
2
- pandas
3
- streamlit
 
1
+ -e git+https://github.com/mindee/doctr.git#egg=python-doctr[viz]
2
+ streamlit>=1.0.0
 
src/python-doctr/.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 🐛 Bug report
2
+ description: Create a report to help us improve the library
3
+ labels: 'type: bug'
4
+
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: >
9
+ #### Before reporting a bug, please check that the issue hasn't already been addressed in [the existing and past issues](https://github.com/mindee/doctr/issues?q=is%3Aissue).
10
+ - type: textarea
11
+ attributes:
12
+ label: Bug description
13
+ description: |
14
+ A clear and concise description of what the bug is.
15
+
16
+ Please explain the result you observed and the behavior you were expecting.
17
+ placeholder: |
18
+ A clear and concise description of what the bug is.
19
+ validations:
20
+ required: true
21
+
22
+ - type: textarea
23
+ attributes:
24
+ label: Code snippet to reproduce the bug
25
+ description: |
26
+ Sample code to reproduce the problem.
27
+
28
+ Please wrap your code snippet with ```` ```triple quotes blocks``` ```` for readability.
29
+ placeholder: |
30
+ ```python
31
+ Sample code to reproduce the problem
32
+ ```
33
+ validations:
34
+ required: true
35
+ - type: textarea
36
+ attributes:
37
+ label: Error traceback
38
+ description: |
39
+ The error message you received running the code snippet, with the full traceback.
40
+
41
+ Please wrap your error message with ```` ```triple quotes blocks``` ```` for readability.
42
+ placeholder: |
43
+ ```
44
+ The error message you got, with the full traceback.
45
+ ```
46
+ validations:
47
+ required: true
48
+ - type: textarea
49
+ attributes:
50
+ label: Environment
51
+ description: |
52
+ Please run the following command and paste the output below.
53
+ ```sh
54
+ wget https://raw.githubusercontent.com/mindee/doctr/main/scripts/collect_env.py
55
+ # For security purposes, please check the contents of collect_env.py before running it.
56
+ python collect_env.py
57
+ ```
58
+ validations:
59
+ required: true
60
+ - type: markdown
61
+ attributes:
62
+ value: >
63
+ Thanks for helping us improve the library!
src/python-doctr/.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ blank_issues_enabled: true
2
+ contact_links:
3
+ - name: Usage questions
4
+ url: https://github.com/mindee/doctr/discussions
5
+ about: Ask questions and discuss with other docTR community members
src/python-doctr/.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 🚀 Feature request
2
+ description: Submit a proposal/request for a new feature for docTR
3
+ labels: 'type: enhancement'
4
+
5
+ body:
6
+ - type: textarea
7
+ attributes:
8
+ label: 🚀 The feature
9
+ description: >
10
+ A clear and concise description of the feature proposal
11
+ validations:
12
+ required: true
13
+ - type: textarea
14
+ attributes:
15
+ label: Motivation, pitch
16
+ description: >
17
+ Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
18
+ validations:
19
+ required: true
20
+ - type: textarea
21
+ attributes:
22
+ label: Alternatives
23
+ description: >
24
+ A description of any alternative solutions or features you've considered, if any.
25
+ - type: textarea
26
+ attributes:
27
+ label: Additional context
28
+ description: >
29
+ Add any other context or screenshots about the feature request.
30
+ - type: markdown
31
+ attributes:
32
+ value: >
33
+ Thanks for contributing 🎉
src/python-doctr/.github/dependabot.yml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: "pip"
4
+ directory: "/"
5
+ open-pull-requests-limit: 10
6
+ target-branch: "main"
7
+ labels: ["topic: build"]
8
+ schedule:
9
+ interval: weekly
10
+ day: sunday
11
+ reviewers:
12
+ - "charlesmindee"
13
+ - "felixdittrich92"
14
+ - "odulcy-mindee"
15
+ - package-ecosystem: "github-actions"
16
+ directory: "/"
17
+ open-pull-requests-limit: 10
18
+ target-branch: "main"
19
+ labels: ["topic: ci"]
20
+ schedule:
21
+ interval: weekly
22
+ day: sunday
23
+ reviewers:
24
+ - "charlesmindee"
25
+ - "felixdittrich92"
26
+ - "odulcy-mindee"
27
+ groups:
28
+ github-actions:
29
+ patterns:
30
+ - "*"
src/python-doctr/.github/release.yml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ changelog:
2
+ exclude:
3
+ labels:
4
+ - ignore-for-release
5
+ categories:
6
+ - title: Breaking Changes 🛠
7
+ labels:
8
+ - "type: breaking change"
9
+ # NEW FEATURES
10
+ - title: New Features
11
+ labels:
12
+ - "type: new feature"
13
+ # BUG FIXES
14
+ - title: Bug Fixes
15
+ labels:
16
+ - "type: bug"
17
+ # IMPROVEMENTS
18
+ - title: Improvements
19
+ labels:
20
+ - "type: enhancement"
21
+ # MISC
22
+ - title: Miscellaneous
23
+ labels:
24
+ - "type: misc"
src/python-doctr/.github/verify_pr_labels.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2025, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ """Borrowed & adapted from https://github.com/pytorch/vision/blob/main/.github/process_commit.py
7
+ This script finds the merger responsible for labeling a PR by a commit SHA. It is used by the workflow in
8
+ '.github/workflows/pr-labels.yml'. If there exists no PR associated with the commit or the PR is properly labeled,
9
+ this script is a no-op.
10
+ Note: we ping the merger only, not the reviewers, as the reviewers can sometimes be external to the project
11
+ with no labeling responsibility, so we don't want to bother them.
12
+ """
13
+
14
+ from typing import Any
15
+
16
+ import requests
17
+
18
+ # For a PR to be properly labeled it should have one primary label and one secondary label
19
+
20
+ # Should specify the type of change
21
+ PRIMARY_LABELS = {
22
+ "type: new feature",
23
+ "type: bug",
24
+ "type: enhancement",
25
+ "type: misc",
26
+ }
27
+
28
+ # Should specify what has been modified
29
+ SECONDARY_LABELS = {
30
+ "topic: documentation",
31
+ "module: datasets",
32
+ "module: io",
33
+ "module: models",
34
+ "module: transforms",
35
+ "module: utils",
36
+ "ext: api",
37
+ "ext: demo",
38
+ "ext: docs",
39
+ "ext: notebooks",
40
+ "ext: references",
41
+ "ext: scripts",
42
+ "ext: tests",
43
+ "topic: build",
44
+ "topic: ci",
45
+ "topic: docker",
46
+ }
47
+
48
+ GH_ORG = "mindee"
49
+ GH_REPO = "doctr"
50
+
51
+
52
def query_repo(cmd: str, *, accept) -> Any:
    """Call the GitHub REST API for this repository and return the decoded JSON.

    Args:
        cmd: path under the repository endpoint (e.g. ``"pulls/123"``)
        accept: value for the HTTP ``Accept`` header (GitHub media type)

    Returns:
        The JSON-decoded response body.
    """
    response = requests.get(
        f"https://api.github.com/repos/{GH_ORG}/{GH_REPO}/{cmd}",
        headers=dict(Accept=accept),
        # Without a timeout, requests can block forever and hang the CI job
        timeout=30,
    )
    return response.json()
55
+
56
+
57
def get_pr_merger_and_labels(pr_number: int) -> tuple[str, set[str]]:
    """Fetch the merger login and the set of label names for a PR.

    Args:
        pr_number: the pull request number to query

    Returns:
        ``(merger, labels)`` where ``merger`` is the merger's login (or None
        when the PR was not merged) and ``labels`` is the set of label names.
    """
    # See https://docs.github.com/en/rest/reference/pulls#get-a-pull-request
    data = query_repo(f"pulls/{pr_number}", accept="application/vnd.github.v3+json")
    # "merged_by" is JSON null (i.e. None) for unmerged PRs; dict.get's default
    # only applies when the key is *missing*, so guard against None explicitly
    # to avoid AttributeError on None.get("login").
    merger = (data.get("merged_by") or {}).get("login")
    labels = {label["name"] for label in data["labels"]}
    return merger, labels
63
+
64
+
65
def main(args):
    """Print an @-mention of the PR merger when the PR is mislabeled."""
    merger, labels = get_pr_merger_and_labels(args.pr)
    # A well-labeled PR carries at least one primary and one secondary label
    has_primary = bool(PRIMARY_LABELS.intersection(labels))
    has_secondary = bool(SECONDARY_LABELS.intersection(labels))
    if isinstance(merger, str) and not (has_primary and has_secondary):
        print(f"@{merger}")
70
+
71
+
72
def parse_args():
    """Parse command-line arguments: a single positional PR number."""
    import argparse

    parser = argparse.ArgumentParser(
        description="PR label checker",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("pr", type=int, help="PR number")
    return parser.parse_args()
83
+
84
+
85
if __name__ == "__main__":
    # Script entry point: check the labels of the PR given on the command line
    main(parse_args())
src/python-doctr/.github/workflows/builds.yml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: builds
2
+
3
+ on:
4
+ push:
5
+ branches: main
6
+ pull_request:
7
+ branches: main
8
+
9
+ jobs:
10
+ build:
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ os: [ubuntu-latest, macos-latest, windows-latest]
16
+ python: ["3.10", "3.11"]
17
+ steps:
18
+ - uses: actions/checkout@v5
19
+ - if: matrix.os == 'macos-latest'
20
+ name: Install MacOS prerequisites
21
+ run: brew install cairo pango gdk-pixbuf libffi
22
+ - name: Set up Python
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ # MacOS issue ref.: https://github.com/actions/setup-python/issues/855 & https://github.com/actions/setup-python/issues/865
26
+ python-version: ${{ matrix.os == 'macos-latest' && matrix.python == '3.10' && '3.11' || matrix.python }}
27
+ architecture: x64
28
+ - name: Cache python modules
29
+ uses: actions/cache@v4
30
+ with:
31
+ path: ~/.cache/pip
32
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}
33
+ - name: Install package
34
+ run: |
35
+ python -m pip install --upgrade pip
36
+ if [ "${{ runner.os }}" = "Windows" ]; then
37
+ pip install -e .[viz] --upgrade
38
+ else
39
+ pip install -e .[viz,html] --upgrade
40
+ fi
41
+ shell: bash # Ensures shell is consistent across OSes
42
+ - name: Import package
43
+ run: python -c "import doctr; print(doctr.__version__)"
src/python-doctr/.github/workflows/clear_caches.yml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Clear GitHub runner caches
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ schedule:
6
+ - cron: '0 0 * * *' # Runs once a day
7
+
8
+ jobs:
9
+ clear:
10
+ name: Clear caches
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: MyAlbum/purge-cache@v2
14
+ with:
15
+ max-age: 172800 # Caches older than 2 days are deleted
src/python-doctr/.github/workflows/demo.yml ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: demo
2
+
3
+ on:
4
+ # Run 'test-demo' on every pull request to the main branch
5
+ pull_request:
6
+ branches: [main]
7
+
8
+ # Run 'test-demo' on every push to the main branch or both jobs when a new version tag is pushed
9
+ push:
10
+ branches:
11
+ - main
12
+ tags:
13
+ - 'v*'
14
+
15
+ # Run 'sync-to-hub' on a scheduled cron job
16
+ schedule:
17
+ - cron: '0 2 10 * *' # At 02:00 on day-of-month 10 (every month)
18
+
19
+ # Allow manual triggering of the workflow
20
+ workflow_dispatch:
21
+
22
+ jobs:
23
+ test-demo:
24
+ runs-on: ${{ matrix.os }}
25
+ strategy:
26
+ fail-fast: false
27
+ matrix:
28
+ os: [ubuntu-latest]
29
+ python: ["3.10"]
30
+ steps:
31
+ - if: matrix.os == 'macos-latest'
32
+ name: Install MacOS prerequisites
33
+ run: brew install cairo pango gdk-pixbuf libffi
34
+ - uses: actions/checkout@v5
35
+ - name: Set up Python
36
+ uses: actions/setup-python@v5
37
+ with:
38
+ python-version: ${{ matrix.python }}
39
+ architecture: x64
40
+ - name: Cache python modules
41
+ uses: actions/cache@v4
42
+ with:
43
+ path: ~/.cache/pip
44
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('demo/pt-requirements.txt') }}
45
+ - name: Install dependencies
46
+ run: |
47
+ python -m pip install --upgrade pip
48
+ pip install -e .[viz,html] --upgrade
49
+ pip install -r demo/pt-requirements.txt
50
+ - name: Run demo
51
+ run: |
52
+ streamlit --version
53
+ screen -dm streamlit run demo/app.py
54
+ sleep 10
55
+ curl http://localhost:8501/docs
56
+
57
+ # This job only runs when a new version tag is pushed or during the cron job or when manually triggered
58
+ sync-to-hub:
59
+ if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
60
+ needs: test-demo
61
+ runs-on: ${{ matrix.os }}
62
+ strategy:
63
+ fail-fast: false
64
+ matrix:
65
+ os: [ubuntu-latest]
66
+ python: ["3.10"]
67
+ steps:
68
+ - uses: actions/checkout@v5
69
+ with:
70
+ fetch-depth: 0
71
+ - name: Set up Python
72
+ uses: actions/setup-python@v5
73
+ with:
74
+ python-version: ${{ matrix.python }}
75
+ architecture: x64
76
+ - name: Install huggingface_hub
77
+ run: pip install huggingface-hub
78
+ - name: Upload folder to Hugging Face
79
+ # Only keep the requirements.txt file for the demo (PyTorch)
80
+ run: |
81
+ mv demo/pt-requirements.txt demo/requirements.txt
82
+
83
+ python -c "
84
+ from huggingface_hub import HfApi
85
+ api = HfApi(token='${{ secrets.HF_TOKEN }}')
86
+ repo_id = 'mindee/doctr'
87
+ api.upload_folder(repo_id=repo_id, repo_type='space', folder_path='demo/')
88
+ api.restart_space(repo_id=repo_id, factory_reboot=True)
89
+ "
src/python-doctr/.github/workflows/doc-status.yml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: doc-status
2
+ on:
3
+ page_build
4
+
5
+ jobs:
6
+ see-page-build-payload:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - name: Set up Python
10
+ uses: actions/setup-python@v5
11
+ with:
12
+ python-version: "3.10"
13
+ architecture: x64
14
+ - name: check status
15
+ run: |
16
+ import os
17
+ status, errormsg = os.getenv('STATUS'), os.getenv('ERROR')
18
+ if status != 'built': raise AssertionError(f"There was an error building the page on GitHub pages.\n\nStatus: {status}\n\nError message: {errormsg}")
19
+ shell: python
20
+ env:
21
+ STATUS: ${{ github.event.build.status }}
22
+ ERROR: ${{ github.event.build.error.message }}
src/python-doctr/.github/workflows/docker.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: docker
2
+
3
+ on:
4
+ push:
5
+ branches: main
6
+ pull_request:
7
+ branches: main
8
+
9
+ jobs:
10
+ docker-package:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v5
14
+ - name: Build docker image
15
+ run: docker build -t doctr-py3.10-slim --build-arg SYSTEM=cpu .
16
+ - name: Run docker container
17
+ run: docker run doctr-py3.10-slim python3 -c 'import doctr'
18
+
19
+ pytest-api:
20
+ runs-on: ${{ matrix.os }}
21
+ strategy:
22
+ matrix:
23
+ os: [ubuntu-latest]
24
+ python: ["3.10"]
25
+ steps:
26
+ - uses: actions/checkout@v5
27
+ - uses: actions/setup-python@v5
28
+ with:
29
+ python-version: ${{ matrix.python }}
30
+ architecture: x64
31
+ - name: Build & run docker
32
+ run: cd api && make lock && make run
33
+ - name: Ping server
34
+ run: wget --spider --tries=12 http://localhost:8080/docs
35
+ - name: Run docker test
36
+ run: cd api && make test
src/python-doctr/.github/workflows/docs.yml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: docs
2
+ on:
3
+ push:
4
+ branches: main
5
+
6
+ jobs:
7
+ docs-deploy:
8
+ runs-on: ${{ matrix.os }}
9
+ strategy:
10
+ matrix:
11
+ os: [ubuntu-latest]
12
+ python: ["3.10"]
13
+ steps:
14
+ - uses: actions/checkout@v5
15
+ with:
16
+ persist-credentials: false
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: ${{ matrix.python }}
21
+ architecture: x64
22
+ - name: Cache python modules
23
+ uses: actions/cache@v4
24
+ with:
25
+ path: ~/.cache/pip
26
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-docs
27
+ - name: Install dependencies
28
+ run: |
29
+ python -m pip install --upgrade pip
30
+ pip install -e .[viz,html] --upgrade
31
+ pip install -e .[docs]
32
+
33
+ - name: Build documentation
34
+ run: cd docs && bash build.sh
35
+
36
+ - name: Documentation sanity check
37
+ run: test -e docs/build/index.html || exit
38
+
39
+ - name: Install SSH Client 🔑
40
+ uses: webfactory/ssh-agent@v0.9.1
41
+ with:
42
+ ssh-private-key: ${{ secrets.SSH_DEPLOY_KEY }}
43
+
44
+ - name: Deploy to Github Pages
45
+ uses: JamesIves/github-pages-deploy-action@v4.7.3
46
+ with:
47
+ BRANCH: gh-pages
48
+ FOLDER: 'docs/build'
49
+ COMMIT_MESSAGE: '[skip ci] Documentation updates'
50
+ CLEAN: true
51
+ SSH: true
src/python-doctr/.github/workflows/main.yml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: tests
2
+
3
+ on:
4
+ push:
5
+ branches: main
6
+ pull_request:
7
+ branches: main
8
+
9
+ jobs:
10
+ pytest-common:
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ matrix:
14
+ os: [ubuntu-latest]
15
+ python: ["3.10"]
16
+ steps:
17
+ - uses: actions/checkout@v5
18
+ - name: Set up Python
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: ${{ matrix.python }}
22
+ architecture: x64
23
+ - name: Cache python modules
24
+ uses: actions/cache@v4
25
+ with:
26
+ path: ~/.cache/pip
27
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-tests
28
+ - name: Install dependencies
29
+ run: |
30
+ python -m pip install --upgrade pip
31
+ pip install -e .[viz,html] --upgrade
32
+ pip install -e .[testing]
33
+ - name: Run unittests
34
+ run: |
35
+ coverage run -m pytest tests/common/ -rs
36
+ coverage xml -o coverage-common.xml
37
+ - uses: actions/upload-artifact@v4
38
+ with:
39
+ name: coverage-common
40
+ path: ./coverage-common.xml
41
+ if-no-files-found: error
42
+
43
+
44
+ pytest-torch:
45
+ runs-on: ${{ matrix.os }}
46
+ strategy:
47
+ matrix:
48
+ os: [ubuntu-latest]
49
+ python: ["3.10"]
50
+ steps:
51
+ - uses: actions/checkout@v5
52
+ - name: Set up Python
53
+ uses: actions/setup-python@v5
54
+ with:
55
+ python-version: ${{ matrix.python }}
56
+ architecture: x64
57
+ - name: Cache python modules
58
+ uses: actions/cache@v4
59
+ with:
60
+ path: ~/.cache/pip
61
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-tests
62
+ - name: Install dependencies
63
+ run: |
64
+ python -m pip install --upgrade pip
65
+ pip install -e .[viz,html] --upgrade
66
+ pip install -e .[testing]
67
+
68
+ - name: Run unittests
69
+ run: |
70
+ coverage run -m pytest tests/pytorch/ -rs
71
+ coverage xml -o coverage-pt.xml
72
+
73
+ - uses: actions/upload-artifact@v4
74
+ with:
75
+ name: coverage-pytorch
76
+ path: ./coverage-pt.xml
77
+ if-no-files-found: error
78
+
79
+ codecov-upload:
80
+ runs-on: ubuntu-latest
81
+ needs: [ pytest-common, pytest-torch ]
82
+ steps:
83
+ - uses: actions/checkout@v5
84
+ - uses: actions/download-artifact@v5
85
+ - name: Upload coverage to Codecov
86
+ uses: codecov/codecov-action@v5
87
+ with:
88
+ flags: unittests
89
+ fail_ci_if_error: true
90
+ token: ${{ secrets.CODECOV_TOKEN }}
src/python-doctr/.github/workflows/pr-labels.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: pr-labels
2
+
3
+ on:
4
+ pull_request:
5
+ branches: main
6
+ types: closed
7
+
8
+ jobs:
9
+ is-properly-labeled:
10
+ if: github.event.pull_request.merged == true
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Checkout repository
14
+ uses: actions/checkout@v5
15
+ - name: Set up python
16
+ uses: actions/setup-python@v5
17
+ - name: Install requests
18
+ run: pip install requests
19
+ - name: Process commit and find merger responsible for labeling
20
+ id: commit
21
+ run: echo "::set-output name=merger::$(python .github/verify_pr_labels.py ${{ github.event.pull_request.number }})"
22
+ - name: 'Comment PR'
23
+ uses: actions/github-script@v7.0.1
24
+ if: ${{ steps.commit.outputs.merger != '' }}
25
+ with:
26
+ github-token: ${{ secrets.GITHUB_TOKEN }}
27
+ script: |
28
+ const { issue: { number: issue_number }, repo: { owner, repo } } = context;
29
+ github.rest.issues.createComment({ issue_number, owner, repo, body: 'Hey ${{ steps.commit.outputs.merger }} 👋\nYou merged this PR, but it is not correctly labeled. The list of valid labels is available at https://github.com/mindee/doctr/blob/main/.github/verify_pr_labels.py' });
src/python-doctr/.github/workflows/public_docker_images.yml ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages
2
+ #
3
+ name: Docker image on ghcr.io
4
+
5
+ on:
6
+ push:
7
+ tags:
8
+ - 'v*'
9
+ pull_request:
10
+ branches: main
11
+ schedule:
12
+ - cron: '0 2 1 */3 *' # At 02:00 on the 1st day of every 3rd month
13
+
14
+ env:
15
+ REGISTRY: ghcr.io
16
+
17
+ jobs:
18
+ build-and-push-image:
19
+ runs-on: ubuntu-latest
20
+
21
+ strategy:
22
+ fail-fast: false
23
+ matrix:
24
+ # Must match version at https://www.python.org/ftp/python/
25
+ python: ["3.10.13", "3.11.8", "3.12.7"]
26
+ # NOTE: Since docTR 1.0.0 torch doesn't exist as a seperate install option it's only to keep the naming convention
27
+ framework: ["torch", "torch,viz,html,contrib"]
28
+
29
+ # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
30
+ permissions:
31
+ contents: read
32
+ packages: write
33
+
34
+ steps:
35
+ - name: Checkout repository
36
+ uses: actions/checkout@v5
37
+
38
+ - name: Log in to the Container registry
39
+ uses: docker/login-action@v3
40
+ with:
41
+ registry: ${{ env.REGISTRY }}
42
+ username: ${{ github.actor }}
43
+ password: ${{ secrets.GITHUB_TOKEN }}
44
+
45
+ - name: Sanitize docker tag
46
+ run: |
47
+ PREFIX_DOCKER_TAG="${{ matrix.framework }}-py${{ matrix.python }}-"
48
+ PREFIX_DOCKER_TAG=$(echo ${PREFIX_DOCKER_TAG}|sed 's/,/-/g')
49
+ echo PREFIX_DOCKER_TAG=${PREFIX_DOCKER_TAG} >> $GITHUB_ENV
50
+ echo $PREFIX_DOCKER_TAG
51
+
52
+ - name: Extract metadata (tags, labels) for Docker
53
+ id: meta
54
+ uses: docker/metadata-action@v5
55
+ with:
56
+ images: ${{ env.REGISTRY }}/${{ github.repository }}
57
+ tags: |
58
+ # used only on schedule event
59
+ type=schedule,pattern={{date 'YYYY-MM'}},prefix=${{ env.PREFIX_DOCKER_TAG }}
60
+ # used only if a tag following semver is published
61
+ type=semver,pattern={{raw}},prefix=${{ env.PREFIX_DOCKER_TAG }}
62
+
63
+ - name: Build Docker image
64
+ id: build
65
+ uses: docker/build-push-action@v6
66
+ with:
67
+ context: .
68
+ build-args: |
69
+ FRAMEWORK=${{ matrix.framework }}
70
+ PYTHON_VERSION=${{ matrix.python }}
71
+ DOCTR_REPO=${{ github.repository }}
72
+ DOCTR_VERSION=${{ github.sha }}
73
+ push: false # push only if `import doctr` works
74
+ tags: ${{ steps.meta.outputs.tags }}
75
+
76
+ - name: Check if `import doctr` works
77
+ run: docker run ${{ steps.build.outputs.imageid }} python3 -c 'import doctr'
78
+
79
+ - name: Push Docker image
80
+ # Push only if the CI is not triggered by "PR on main"
81
+ if: ${{ (github.ref == 'refs/heads/main' && github.event_name != 'pull_request') || (startsWith(github.ref, 'refs/tags') && github.event_name == 'push') }}
82
+ uses: docker/build-push-action@v6
83
+ with:
84
+ context: .
85
+ build-args: |
86
+ FRAMEWORK=${{ matrix.framework }}
87
+ PYTHON_VERSION=${{ matrix.python }}
88
+ DOCTR_REPO=${{ github.repository }}
89
+ DOCTR_VERSION=${{ github.sha }}
90
+ push: true
91
+ tags: ${{ steps.meta.outputs.tags }}
src/python-doctr/.github/workflows/publish.yml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: publish
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ pypi:
9
+ if: "!github.event.release.prerelease"
10
+ strategy:
11
+ fail-fast: false
12
+ matrix:
13
+ os: [ubuntu-latest]
14
+ python: ["3.10"]
15
+ runs-on: ${{ matrix.os }}
16
+ steps:
17
+ - uses: actions/checkout@v5
18
+ - name: Set up Python
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: ${{ matrix.python }}
22
+ architecture: x64
23
+ - name: Cache python modules
24
+ uses: actions/cache@v4
25
+ with:
26
+ path: ~/.cache/pip
27
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}
28
+ - name: Install dependencies
29
+ run: |
30
+ python -m pip install --upgrade pip
31
+ pip install setuptools wheel twine --upgrade
32
+ - name: Get release tag
33
+ id: release_tag
34
+ run: echo "VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_ENV
35
+ - name: Build and publish
36
+ env:
37
+ TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
38
+ TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
39
+ VERSION: ${{ env.VERSION }}
40
+ run: |
41
+ BUILD_VERSION=$VERSION python setup.py sdist bdist_wheel
42
+ twine check dist/*
43
+ twine upload dist/*
44
+
45
+ pypi-check:
46
+ needs: pypi
47
+ if: "!github.event.release.prerelease"
48
+ strategy:
49
+ fail-fast: false
50
+ matrix:
51
+ os: [ubuntu-latest]
52
+ python: ["3.10"]
53
+ runs-on: ${{ matrix.os }}
54
+ steps:
55
+ - uses: actions/checkout@v5
56
+ - name: Set up Python
57
+ uses: actions/setup-python@v5
58
+ with:
59
+ python-version: ${{ matrix.python }}
60
+ architecture: x64
61
+ - name: Install package
62
+ run: |
63
+ python -m pip install --upgrade pip
64
+ pip install python-doctr
65
+ python -c "from importlib.metadata import version; print(version('python-doctr'))"
src/python-doctr/.github/workflows/pull_requests.yml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: pull_requests
2
+
3
+ on:
4
+ pull_request:
5
+ branches: main
6
+
7
+ jobs:
8
+ docs-build:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v5
12
+ - name: Set up Python
13
+ uses: actions/setup-python@v5
14
+ with:
15
+ python-version: "3.10"
16
+ architecture: x64
17
+ - name: Cache python modules
18
+ uses: actions/cache@v4
19
+ with:
20
+ path: ~/.cache/pip
21
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-docs
22
+ - name: Install dependencies
23
+ run: |
24
+ python -m pip install --upgrade pip
25
+ pip install -e .[viz,html] --upgrade
26
+ pip install -e .[docs]
27
+
28
+ - name: Build documentation
29
+ run: cd docs && bash build.sh
30
+
31
+ - name: Documentation sanity check
32
+ run: test -e docs/build/index.html || exit
src/python-doctr/.github/workflows/references.yml ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: references
2
+
3
+ on:
4
+ push:
5
+ branches: main
6
+ pull_request:
7
+ branches: main
8
+
9
+ jobs:
10
+ train-char-classification:
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ os: [ubuntu-latest]
16
+ python: ["3.10"]
17
+ steps:
18
+ - uses: actions/checkout@v5
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: ${{ matrix.python }}
23
+ architecture: x64
24
+
25
+ - name: Cache python modules
26
+ uses: actions/cache@v4
27
+ with:
28
+ path: ~/.cache/pip
29
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-${{ hashFiles('references/requirements.txt') }}
30
+ restore-keys: |
31
+ ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-
32
+ - name: Install dependencies
33
+ run: |
34
+ python -m pip install --upgrade pip
35
+ pip install -e .[viz,html] --upgrade
36
+ pip install -r references/requirements.txt
37
+ sudo apt-get update && sudo apt-get install fonts-freefont-ttf -y
38
+ - name: Train for a short epoch
39
+ run: python references/classification/train_character.py vit_s -b 32 --val-samples 1 --train-samples 1 --epochs 1
40
+
41
+ train-orientation-classification:
42
+ runs-on: ${{ matrix.os }}
43
+ strategy:
44
+ fail-fast: false
45
+ matrix:
46
+ os: [ubuntu-latest]
47
+ python: ["3.10"]
48
+ steps:
49
+ - uses: actions/checkout@v5
50
+ - name: Set up Python
51
+ uses: actions/setup-python@v5
52
+ with:
53
+ python-version: ${{ matrix.python }}
54
+ architecture: x64
55
+ - name: Cache python modules
56
+ uses: actions/cache@v4
57
+ with:
58
+ path: ~/.cache/pip
59
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-${{ hashFiles('references/requirements.txt') }}
60
+ restore-keys: |
61
+ ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-
62
+ - name: Install dependencies
63
+ run: |
64
+ python -m pip install --upgrade pip
65
+ pip install -e .[viz,html] --upgrade
66
+ pip install -r references/requirements.txt
67
+ - name: Download and extract detection toy set
68
+ run: |
69
+ wget https://github.com/mindee/doctr/releases/download/v0.3.1/toy_detection_set-bbbb4243.zip
70
+ sudo apt-get update && sudo apt-get install unzip -y
71
+ unzip toy_detection_set-bbbb4243.zip -d det_set
72
+ - name: Download and extract recognition toy set
73
+ run: |
74
+ wget https://github.com/mindee/doctr/releases/download/v0.3.1/toy_recogition_set-036a4d80.zip
75
+ sudo apt-get update && sudo apt-get install unzip -y
76
+ unzip toy_recogition_set-036a4d80.zip -d reco_set
77
+ - name: Train for a short epoch (document orientation)
78
+ run: python references/classification/train_orientation.py resnet18 --type page --train_path ./det_set --val_path ./det_set -b 2 --epochs 1
79
+ - name: Train for a short epoch (crop orientation)
80
+ run: python references/classification/train_orientation.py resnet18 --type crop --train_path ./reco_set --val_path ./reco_set -b 4 --epochs 1
81
+
82
+ train-text-recognition:
83
+ runs-on: ${{ matrix.os }}
84
+ strategy:
85
+ fail-fast: false
86
+ matrix:
87
+ os: [ubuntu-latest]
88
+ python: ["3.10"]
89
+ steps:
90
+ - uses: actions/checkout@v5
91
+ - name: Set up Python
92
+ uses: actions/setup-python@v5
93
+ with:
94
+ python-version: ${{ matrix.python }}
95
+ architecture: x64
96
+ - name: Cache python modules
97
+ uses: actions/cache@v4
98
+ with:
99
+ path: ~/.cache/pip
100
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-${{ hashFiles('references/requirements.txt') }}
101
+ restore-keys: |
102
+ ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-
103
+ - name: Install dependencies
104
+ run: |
105
+ python -m pip install --upgrade pip
106
+ pip install -e .[viz,html] --upgrade
107
+ pip install -r references/requirements.txt
108
+ - name: Download and extract toy set
109
+ run: |
110
+ wget https://github.com/mindee/doctr/releases/download/v0.3.1/toy_recogition_set-036a4d80.zip
111
+ sudo apt-get update && sudo apt-get install unzip -y
112
+ unzip toy_recogition_set-036a4d80.zip -d reco_set
113
+ - name: Train for a short epoch
114
+ run: python references/recognition/train.py crnn_mobilenet_v3_small --train_path ./reco_set --val_path ./reco_set -b 4 --epochs 1
115
+
116
+ evaluate-text-recognition:
117
+ runs-on: ${{ matrix.os }}
118
+ strategy:
119
+ fail-fast: false
120
+ matrix:
121
+ os: [ubuntu-latest]
122
+ python: ["3.10"]
123
+ steps:
124
+ - uses: actions/checkout@v5
125
+ - name: Set up Python
126
+ uses: actions/setup-python@v5
127
+ with:
128
+ python-version: ${{ matrix.python }}
129
+ architecture: x64
130
+ - name: Cache python modules
131
+ uses: actions/cache@v4
132
+ with:
133
+ path: ~/.cache/pip
134
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}
135
+ - name: Install dependencies
136
+ run: |
137
+ python -m pip install --upgrade pip
138
+ pip install -e .[viz,html] --upgrade
139
+ - name: Evaluate text recognition
140
+ run: python references/recognition/evaluate.py crnn_mobilenet_v3_small --dataset SVT -b 32
141
+
142
+ latency-text-recognition:
143
+ runs-on: ${{ matrix.os }}
144
+ strategy:
145
+ fail-fast: false
146
+ matrix:
147
+ os: [ubuntu-latest]
148
+ python: ["3.10"]
149
+ steps:
150
+ - uses: actions/checkout@v5
151
+ - name: Set up Python
152
+ uses: actions/setup-python@v5
153
+ with:
154
+ python-version: ${{ matrix.python }}
155
+ architecture: x64
156
+ - name: Cache python modules
157
+ uses: actions/cache@v4
158
+ with:
159
+ path: ~/.cache/pip
160
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}
161
+ - name: Install dependencies
162
+ run: |
163
+ python -m pip install --upgrade pip
164
+ pip install -e .[viz,html] --upgrade
165
+ - name: Benchmark latency
166
+ run: python references/recognition/latency.py crnn_mobilenet_v3_small --it 5
167
+
168
+ train-text-detection:
169
+ runs-on: ${{ matrix.os }}
170
+ strategy:
171
+ fail-fast: false
172
+ matrix:
173
+ os: [ubuntu-latest]
174
+ python: ["3.10"]
175
+ steps:
176
+ - uses: actions/checkout@v5
177
+ - name: Set up Python
178
+ uses: actions/setup-python@v5
179
+ with:
180
+ python-version: ${{ matrix.python }}
181
+ architecture: x64
182
+ - name: Cache python modules
183
+ uses: actions/cache@v4
184
+ with:
185
+ path: ~/.cache/pip
186
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-${{ hashFiles('references/requirements.txt') }}
187
+ restore-keys: |
188
+ ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}-
189
+ - name: Install dependencies
190
+ run: |
191
+ python -m pip install --upgrade pip
192
+ pip install -e .[viz,html] --upgrade
193
+ pip install -r references/requirements.txt
194
+ - name: Download and extract toy set
195
+ run: |
196
+ wget https://github.com/mindee/doctr/releases/download/v0.3.1/toy_detection_set-bbbb4243.zip
197
+ sudo apt-get update && sudo apt-get install unzip -y
198
+ unzip toy_detection_set-bbbb4243.zip -d det_set
199
+ - name: Train for a short epoch
200
+ run: python references/detection/train.py db_mobilenet_v3_large --train_path ./det_set --val_path ./det_set -b 2 --epochs 1
201
+
202
+ evaluate-text-detection:
203
+ runs-on: ${{ matrix.os }}
204
+ strategy:
205
+ fail-fast: false
206
+ matrix:
207
+ os: [ubuntu-latest]
208
+ python: ["3.10"]
209
+ steps:
210
+ - uses: actions/checkout@v5
211
+ - name: Set up Python
212
+ uses: actions/setup-python@v5
213
+ with:
214
+ python-version: ${{ matrix.python }}
215
+ architecture: x64
216
+ - name: Cache python modules
217
+ uses: actions/cache@v4
218
+ with:
219
+ path: ~/.cache/pip
220
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}
221
+ - name: Install dependencies
222
+ run: |
223
+ python -m pip install --upgrade pip
224
+ pip install -e .[viz,html] --upgrade
225
+ pip install -r references/requirements.txt
226
+ - name: Evaluate text detection
227
+ run: python references/detection/evaluate.py db_mobilenet_v3_large
228
+
229
+ latency-text-detection:
230
+ runs-on: ${{ matrix.os }}
231
+ strategy:
232
+ fail-fast: false
233
+ matrix:
234
+ os: [ubuntu-latest]
235
+ python: ["3.10"]
236
+ steps:
237
+ - uses: actions/checkout@v5
238
+ - name: Set up Python
239
+ uses: actions/setup-python@v5
240
+ with:
241
+ python-version: ${{ matrix.python }}
242
+ architecture: x64
243
+ - name: Cache python modules
244
+ uses: actions/cache@v4
245
+ with:
246
+ path: ~/.cache/pip
247
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('requirements-pt.txt') }}
248
+ - name: Install dependencies
249
+ run: |
250
+ python -m pip install --upgrade pip
251
+ pip install -e .[viz,html] --upgrade
252
+ - name: Benchmark latency
253
+ run: python references/detection/latency.py db_mobilenet_v3_large --it 5 --size 512
src/python-doctr/.github/workflows/scripts.yml ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: scripts
2
+
3
+ on:
4
+ push:
5
+ branches: main
6
+ pull_request:
7
+ branches: main
8
+
9
+ jobs:
10
+ test-analyze:
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ os: [ubuntu-latest]
16
+ python: ["3.10", "3.11"]
17
+ steps:
18
+ - if: matrix.os == 'macos-latest'
19
+ name: Install MacOS prerequisites
20
+ run: brew install cairo pango gdk-pixbuf libffi
21
+ - uses: actions/checkout@v5
22
+ - name: Set up Python
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: ${{ matrix.python }}
26
+ architecture: x64
27
+ - name: Cache python modules
28
+ uses: actions/cache@v4
29
+ with:
30
+ path: ~/.cache/pip
31
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}
32
+ - name: Install package
33
+ run: |
34
+ python -m pip install --upgrade pip
35
+ pip install -e .[viz,html] --upgrade
36
+
37
+ - name: Run analysis script
38
+ run: |
39
+ wget https://github.com/mindee/doctr/releases/download/v0.1.0/sample.pdf
40
+ python scripts/analyze.py sample.pdf --noblock --detection db_mobilenet_v3_large
41
+
42
+ test-detect-text:
43
+ runs-on: ${{ matrix.os }}
44
+ strategy:
45
+ fail-fast: false
46
+ matrix:
47
+ os: [ubuntu-latest]
48
+ python: ["3.10", "3.11"]
49
+ steps:
50
+ - if: matrix.os == 'macos-latest'
51
+ name: Install MacOS prerequisites
52
+ run: brew install cairo pango gdk-pixbuf libffi
53
+ - uses: actions/checkout@v5
54
+ - name: Set up Python
55
+ uses: actions/setup-python@v5
56
+ with:
57
+ python-version: ${{ matrix.python }}
58
+ architecture: x64
59
+ - name: Cache python modules
60
+ uses: actions/cache@v4
61
+ with:
62
+ path: ~/.cache/pip
63
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}
64
+ - name: Install package
65
+ run: |
66
+ python -m pip install --upgrade pip
67
+ pip install -e .[viz,html] --upgrade
68
+
69
+ - name: Run detection script
70
+ run: |
71
+ wget https://github.com/mindee/doctr/releases/download/v0.1.0/sample.pdf
72
+ python scripts/detect_text.py sample.pdf --detection db_mobilenet_v3_large
73
+
74
+ test-evaluate:
75
+ runs-on: ${{ matrix.os }}
76
+ strategy:
77
+ fail-fast: false
78
+ matrix:
79
+ os: [ubuntu-latest]
80
+ python: ["3.10", "3.11"]
81
+ steps:
82
+ - if: matrix.os == 'macos-latest'
83
+ name: Install MacOS prerequisites
84
+ run: brew install cairo pango gdk-pixbuf libffi
85
+ - uses: actions/checkout@v5
86
+ - name: Set up Python
87
+ uses: actions/setup-python@v5
88
+ with:
89
+ python-version: ${{ matrix.python }}
90
+ architecture: x64
91
+ - name: Cache python modules
92
+ uses: actions/cache@v4
93
+ with:
94
+ path: ~/.cache/pip
95
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}
96
+ - name: Install package
97
+ run: |
98
+ python -m pip install --upgrade pip
99
+ pip install -e .[viz,html] --upgrade
100
+ - name: Run evaluation script
101
+ run: |
102
+ python scripts/evaluate.py db_resnet50 crnn_vgg16_bn --samples 10
103
+ python scripts/evaluate_kie.py db_resnet50 crnn_vgg16_bn --samples 10
104
+
105
+ test-collectenv:
106
+ runs-on: ${{ matrix.os }}
107
+ strategy:
108
+ fail-fast: false
109
+ matrix:
110
+ os: [ubuntu-latest, macos-latest, windows-latest]
111
+ python: ["3.10", "3.11"]
112
+ steps:
113
+ - uses: actions/checkout@v5
114
+ - name: Set up Python
115
+ uses: actions/setup-python@v5
116
+ with:
117
+ # MacOS issue ref.: https://github.com/actions/setup-python/issues/855 & https://github.com/actions/setup-python/issues/865
118
+ python-version: ${{ matrix.os == 'macos-latest' && matrix.python == '3.10' && '3.11' || matrix.python }}
119
+ architecture: x64
120
+ - name: Run environment collection script
121
+ run: python scripts/collect_env.py
src/python-doctr/.github/workflows/style.yml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: style
2
+
3
+ on:
4
+ push:
5
+ branches: main
6
+ pull_request:
7
+ branches: main
8
+
9
+ jobs:
10
+ ruff:
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ matrix:
14
+ os: [ubuntu-latest]
15
+ python: ["3.10"]
16
+ steps:
17
+ - uses: actions/checkout@v5
18
+ - name: Set up Python
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: ${{ matrix.python }}
22
+ architecture: x64
23
+ - name: Run ruff
24
+ run: |
25
+ pip install ruff --upgrade
26
+ ruff --version
27
+ ruff check --diff .
28
+
29
+ mypy:
30
+ runs-on: ${{ matrix.os }}
31
+ strategy:
32
+ matrix:
33
+ os: [ubuntu-latest]
34
+ python: ["3.10"]
35
+ steps:
36
+ - uses: actions/checkout@v5
37
+ - name: Set up Python
38
+ uses: actions/setup-python@v5
39
+ with:
40
+ python-version: ${{ matrix.python }}
41
+ architecture: x64
42
+ - name: Cache python modules
43
+ uses: actions/cache@v4
44
+ with:
45
+ path: ~/.cache/pip
46
+ key: ${{ runner.os }}-pkg-deps-${{ matrix.python }}-${{ hashFiles('pyproject.toml') }}-style
47
+ - name: Install dependencies
48
+ run: |
49
+ python -m pip install --upgrade pip
50
+ pip install -e .[dev] --upgrade
51
+ pip install mypy --upgrade
52
+ - name: Run mypy
53
+ run: |
54
+ mypy --version
55
+ mypy
src/python-doctr/.gitignore ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # Temp files
132
+ doctr/version.py
133
+ logs/
134
+ wandb/
135
+ .idea/
136
+
137
+ # Checkpoints
138
+ *.pt
139
+ *.pb
140
+ *.index
src/python-doctr/.pre-commit-config.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v6.0.0
4
+ hooks:
5
+ - id: check-ast
6
+ - id: check-yaml
7
+ exclude: .conda
8
+ - id: check-toml
9
+ - id: check-json
10
+ - id: check-added-large-files
11
+ exclude: docs/images/
12
+ - id: end-of-file-fixer
13
+ - id: trailing-whitespace
14
+ - id: debug-statements
15
+ - id: check-merge-conflict
16
+ - id: no-commit-to-branch
17
+ args: ['--branch', 'main']
18
+ - repo: https://github.com/astral-sh/ruff-pre-commit
19
+ rev: v0.12.8
20
+ hooks:
21
+ - id: ruff
22
+ args: [ --fix ]
23
+ - id: ruff-format
src/python-doctr/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, religion, or sexual identity
10
+ and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ * Demonstrating empathy and kindness toward other people
21
+ * Being respectful of differing opinions, viewpoints, and experiences
22
+ * Giving and gracefully accepting constructive feedback
23
+ * Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ * Focusing on what is best not just for us as individuals, but for the
26
+ overall community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ * The use of sexualized language or imagery, and sexual attention or
31
+ advances of any kind
32
+ * Trolling, insulting or derogatory comments, and personal or political attacks
33
+ * Public or private harassment
34
+ * Publishing others' private information, such as a physical or email
35
+ address, without their explicit permission
36
+ * Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+ Examples of representing our community include using an official e-mail address,
56
+ posting via an official social media account, or acting as an appointed
57
+ representative at an online or offline event.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported to the community leaders responsible for enforcement at
63
+ contact@mindee.com.
64
+ All complaints will be reviewed and investigated promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series
86
+ of actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or
93
+ permanent ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within
113
+ the community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.0, available at
119
+ https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120
+
121
+ Community Impact Guidelines were inspired by [Mozilla's code of conduct
122
+ enforcement ladder](https://github.com/mozilla/diversity).
123
+
124
+ [homepage]: https://www.contributor-covenant.org
125
+
126
+ For answers to common questions about this code of conduct, see the FAQ at
127
+ https://www.contributor-covenant.org/faq. Translations are available at
128
+ https://www.contributor-covenant.org/translations.
src/python-doctr/CONTRIBUTING.md ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to docTR
2
+
3
+ Everything you need to know to contribute efficiently to the project.
4
+
5
+ ## Codebase structure
6
+
7
+ - [doctr](https://github.com/mindee/doctr/blob/main/doctr) - The package codebase
8
+ - [tests](https://github.com/mindee/doctr/blob/main/tests) - Python unit tests
9
+ - [docs](https://github.com/mindee/doctr/blob/main/docs) - Library documentation building
10
+ - [scripts](https://github.com/mindee/doctr/blob/main/scripts) - Example scripts
11
+ - [references](https://github.com/mindee/doctr/blob/main/references) - Reference training scripts
12
+ - [demo](https://github.com/mindee/doctr/blob/main/demo) - Small demo app to showcase docTR capabilities
13
+ - [api](https://github.com/mindee/doctr/blob/main/api) - A minimal template to deploy a REST API with docTR
14
+
15
+ ## Continuous Integration
16
+
17
+ This project uses the following integrations to ensure proper codebase maintenance:
18
+
19
+ - [Github Workflow](https://help.github.com/en/actions/configuring-and-managing-workflows/configuring-a-workflow) - run jobs for package build and coverage
20
+ - [Codecov](https://codecov.io/) - reports back coverage results
21
+
22
+ As a contributor, you will only have to ensure coverage of your code by adding appropriate unit testing of your code.
23
+
24
+ ## Feedback
25
+
26
+ ### Feature requests & bug report
27
+
28
+ Whether you encountered a problem, or you have a feature suggestion, your input has value and can be used by contributors to reference it in their developments. For this purpose, we advise you to use Github [issues](https://github.com/mindee/doctr/issues).
29
+
30
+ First, check whether the topic wasn't already covered in an open / closed issue. If not, feel free to open a new one! When doing so, use issue templates whenever possible and provide enough information for other contributors to jump in.
31
+
32
+ ### Questions
33
+
34
+ If you are wondering how to do something with docTR, or a more general question, you should consider checking out Github [discussions](https://github.com/mindee/doctr/discussions). See it as a Q&A forum, or the docTR-specific StackOverflow!
35
+
36
+ ## Developing docTR
37
+
38
+ ### Developer mode installation
39
+
40
+ Install all additional dependencies with the following command:
41
+
42
+ ```shell
43
+ python -m pip install --upgrade pip
44
+ pip install -e '.[dev]'
45
+ pre-commit install
46
+ ```
47
+
48
+ ### Commits
49
+
50
+ - **Code**: ensure to provide docstrings to your Python code. In doing so, please follow [Google-style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) so it can ease the process of documentation later.
51
+ - **Commit message**: please follow [Udacity guide](http://udacity.github.io/git-styleguide/)
52
+
53
+ ### Unit tests
54
+
55
+ In order to run the same unit tests as the CI workflows, you can run the unit tests locally:
56
+
57
+ ```shell
58
+ make test
59
+ ```
60
+
61
+ ### Code quality
62
+
63
+ To run all quality checks together
64
+
65
+ ```shell
66
+ make quality
67
+ ```
68
+
69
+ #### Code style verification
70
+
71
+ To run all style checks together
72
+
73
+ ```shell
74
+ make style
75
+ ```
76
+
77
+ ### Modifying the documentation
78
+
79
+ The current documentation is built using `sphinx` thanks to our CI.
80
+ You can build the documentation locally:
81
+
82
+ ```shell
83
+ make docs-single-version
84
+ ```
85
+
86
+ Please note that files that have not been modified will not be rebuilt. If you want to force a complete rebuild, you can delete the `_build` directory. Additionally, you may need to clear your web browser's cache to see the modifications.
87
+
88
+ You can now open your local version of the documentation located at `docs/_build/index.html` in your browser
89
+
90
+ ## Let's connect
91
+
92
+ Should you wish to connect somewhere else than on GitHub, feel free to join us on [Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-uzgmljfl-MotFVfH~IdEZxjp~0zldww), where you will find a `#doctr` channel!
src/python-doctr/Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvidia/cuda:12.2.0-base-ubuntu22.04
2
+
3
+ ENV DEBIAN_FRONTEND=noninteractive
4
+ ENV LANG=C.UTF-8
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV PYTHONDONTWRITEBYTECODE=1
7
+
8
+
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ # - Other packages
11
+ build-essential \
12
+ pkg-config \
13
+ curl \
14
+ wget \
15
+ software-properties-common \
16
+ unzip \
17
+ git \
18
+ # - Packages to build Python
19
+ tar make gcc zlib1g-dev libffi-dev libssl-dev liblzma-dev libbz2-dev libsqlite3-dev \
20
+ # - Packages for docTR
21
+ libgl1-mesa-dev libsm6 libxext6 libxrender-dev libpangocairo-1.0-0 \
22
+ && apt-get clean \
23
+ && rm -rf /var/lib/apt/lists/*
24
+
25
+ # Install Python
26
+ ARG PYTHON_VERSION=3.10.13
27
+
28
+ RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \
29
+ tar -zxf Python-$PYTHON_VERSION.tgz && \
30
+ cd Python-$PYTHON_VERSION && \
31
+ mkdir /opt/python/ && \
32
+ ./configure --prefix=/opt/python && \
33
+ make && \
34
+ make install && \
35
+ cd .. && \
36
+ rm Python-$PYTHON_VERSION.tgz && \
37
+ rm -r Python-$PYTHON_VERSION
38
+
39
+ ENV PATH=/opt/python/bin:$PATH
40
+
41
+ # Install docTR
42
+ ARG FRAMEWORK=torch
43
+ ARG DOCTR_REPO='mindee/doctr'
44
+ ARG DOCTR_VERSION=main
45
+ RUN pip3 install -U pip setuptools wheel && \
46
+ pip3 install "python-doctr[$FRAMEWORK]@git+https://github.com/$DOCTR_REPO.git@$DOCTR_VERSION"
src/python-doctr/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2022 Mindee
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
src/python-doctr/Makefile ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: quality style test test-common test-torch docs-single-version docs
2
+ # this target runs checks on all files
3
+ quality:
4
+ ruff check .
5
+ mypy doctr/
6
+
7
+ # this target runs checks on all files and potentially modifies some of them
8
+ style:
9
+ ruff format .
10
+ ruff check --fix .
11
+
12
+ # Run tests for the library
13
+ test:
14
+ coverage run -m pytest tests/common/ -rs
15
+ coverage run -m pytest tests/pytorch/ -rs
16
+
17
+ test-common:
18
+ coverage run -m pytest tests/common/ -rs
19
+
20
+ test-torch:
21
+ coverage run -m pytest tests/pytorch/ -rs
22
+
23
+ # Check that docs can build
24
+ docs-single-version:
25
+ sphinx-build docs/source docs/_build -a
26
+
27
+ # Build the multi-version docs
28
+ docs:
29
+ cd docs && bash build.sh
src/python-doctr/README.md ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <img src="https://github.com/mindee/doctr/raw/main/docs/images/Logo_doctr.gif" width="40%">
3
+ </p>
4
+
5
+ [![Slack Icon](https://img.shields.io/badge/Slack-Community-4A154B?style=flat-square&logo=slack&logoColor=white)](https://slack.mindee.com) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) ![Build Status](https://github.com/mindee/doctr/workflows/builds/badge.svg) [![Docker Images](https://img.shields.io/badge/Docker-4287f5?style=flat&logo=docker&logoColor=white)](https://github.com/mindee/doctr/pkgs/container/doctr) [![codecov](https://codecov.io/gh/mindee/doctr/branch/main/graph/badge.svg?token=577MO567NM)](https://codecov.io/gh/mindee/doctr) [![CodeFactor](https://www.codefactor.io/repository/github/mindee/doctr/badge?s=bae07db86bb079ce9d6542315b8c6e70fa708a7e)](https://www.codefactor.io/repository/github/mindee/doctr) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/340a76749b634586a498e1c0ab998f08)](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [![Doc Status](https://github.com/mindee/doctr/workflows/doc-status/badge.svg)](https://mindee.github.io/doctr) [![Pypi](https://img.shields.io/badge/pypi-v1.0.0-blue.svg)](https://pypi.org/project/python-doctr/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) [![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20docTR%20Guru-006BFF)](https://gurubase.io/g/doctr)
6
+
7
+
8
+ **Optical Character Recognition made seamless & accessible to anyone, powered by PyTorch**
9
+
10
+ What you can expect from this repository:
11
+
12
+ - efficient ways to parse textual information (localize and identify each word) from your documents
13
+ - guidance on how to integrate this in your current architecture
14
+
15
+ ![OCR_example](https://github.com/mindee/doctr/raw/main/docs/images/ocr.png)
16
+
17
+ ## Quick Tour
18
+
19
+ ### Getting your pretrained model
20
+
21
+ End-to-End OCR is achieved in docTR using a two-stage approach: text detection (localizing words), then text recognition (identify all characters in the word).
22
+ As such, you can select the architecture used for [text detection](https://mindee.github.io/doctr/latest/modules/models.html#doctr-models-detection), and the one for [text recognition](https://mindee.github.io/doctr/latest//modules/models.html#doctr-models-recognition) from the list of available implementations.
23
+
24
+ ```python
25
+ from doctr.models import ocr_predictor
26
+
27
+ model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
28
+ ```
29
+
30
+ ### Reading files
31
+
32
+ Documents can be interpreted from PDF or images:
33
+
34
+ ```python
35
+ from doctr.io import DocumentFile
36
+ # PDF
37
+ pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
38
+ # Image
39
+ single_img_doc = DocumentFile.from_images("path/to/your/img.jpg")
40
+ # Webpage (requires `weasyprint` to be installed)
41
+ webpage_doc = DocumentFile.from_url("https://www.yoursite.com")
42
+ # Multiple page images
43
+ multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])
44
+ ```
45
+
46
+ ### Putting it together
47
+
48
+ Let's use the default pretrained model for an example:
49
+
50
+ ```python
51
+ from doctr.io import DocumentFile
52
+ from doctr.models import ocr_predictor
53
+
54
+ model = ocr_predictor(pretrained=True)
55
+ # PDF
56
+ doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
57
+ # Analyze
58
+ result = model(doc)
59
+ ```
60
+
61
+ ### Dealing with rotated documents
62
+
63
+ Should you use docTR on documents that include rotated pages, or pages with multiple box orientations,
64
+ you have multiple options to handle it:
65
+
66
+ - If you only use straight document pages with straight words (horizontal, same reading direction),
67
+ consider passing `assume_straight_pages=True` to the ocr_predictor. It will directly fit straight boxes
68
+ on your page and return straight boxes, which makes it the fastest option.
69
+
70
+ - If you want the predictor to output straight boxes (no matter the orientation of your pages, the final localizations
71
+ will be converted to straight boxes), you need to pass `export_as_straight_boxes=True` in the predictor. Otherwise, if `assume_straight_pages=False`, it will return rotated bounding boxes (potentially with an angle of 0°).
72
+
73
+ If both options are set to False, the predictor will always fit and return rotated boxes.
74
+
75
+ To interpret your model's predictions, you can visualize them interactively as follows:
76
+
77
+ ```python
78
+ # Display the result (requires matplotlib & mplcursors to be installed)
79
+ result.show()
80
+ ```
81
+
82
+ ![Visualization sample](https://github.com/mindee/doctr/raw/main/docs/images/doctr_example_script.gif)
83
+
84
+ Or even rebuild the original document from its predictions:
85
+
86
+ ```python
87
+ import matplotlib.pyplot as plt
88
+
89
+ synthetic_pages = result.synthesize()
90
+ plt.imshow(synthetic_pages[0]); plt.axis('off'); plt.show()
91
+ ```
92
+
93
+ ![Synthesis sample](https://github.com/mindee/doctr/raw/main/docs/images/synthesized_sample.png)
94
+
95
+ The `ocr_predictor` returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`).
96
+ To get a better understanding of our document model, check our [documentation](https://mindee.github.io/doctr/modules/io.html#document-structure):
97
+
98
+ You can also export them as a nested dict, more appropriate for JSON format:
99
+
100
+ ```python
101
+ json_output = result.export()
102
+ ```
103
+
104
+ ### Use the KIE predictor
105
+
106
+ The KIE predictor is a more flexible predictor compared to OCR as your detection model can detect multiple classes in a document. For example, you can have a detection model to detect just dates and addresses in a document.
107
+
108
+ The KIE predictor makes it possible to use detector with multiple classes with a recognition model and to have the whole pipeline already setup for you.
109
+
110
+ ```python
111
+ from doctr.io import DocumentFile
112
+ from doctr.models import kie_predictor
113
+
114
+ # Model
115
+ model = kie_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
116
+ # PDF
117
+ doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
118
+ # Analyze
119
+ result = model(doc)
120
+
121
+ predictions = result.pages[0].predictions
122
+ for class_name in predictions.keys():
123
+ list_predictions = predictions[class_name]
124
+ for prediction in list_predictions:
125
+ print(f"Prediction for {class_name}: {prediction}")
126
+ ```
127
+
128
+ The KIE predictor results per page are in a dictionary format, with each key representing a class name and its value being the predictions for that class.
129
+
130
+ ### If you are looking for support from the Mindee team
131
+
132
+ [![Bad OCR test detection image asking the developer if they need help](https://github.com/mindee/doctr/raw/main/docs/images/doctr-need-help.png)](https://mindee.com/product/doctr)
133
+
134
+ ## Installation
135
+
136
+ ### Prerequisites
137
+
138
+ Python 3.10 (or higher) and [pip](https://pip.pypa.io/en/stable/) are required to install docTR.
139
+
140
+ ### Latest release
141
+
142
+ You can then install the latest release of the package using [pypi](https://pypi.org/project/python-doctr/) as follows:
143
+
144
+ ```shell
145
+ pip install python-doctr
146
+ ```
147
+
148
+ We try to keep extra dependencies to a minimum. You can install specific builds as follows:
149
+
150
+ ```shell
151
+ # standard build
152
+ pip install python-doctr
153
+ # optional dependencies for visualization, html, and contrib modules can be installed as follows:
154
+ pip install "python-doctr[viz,html,contrib]"
155
+ ```
156
+
157
+ ### Developer mode
158
+
159
+ Alternatively, you can install it from source, which will require you to install [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).
160
+ First clone the project repository:
161
+
162
+ ```shell
163
+ git clone https://github.com/mindee/doctr.git
164
+ pip install -e doctr/.
165
+ ```
166
+
167
+ Again, if you prefer to avoid the risk of missing dependencies, you can install the build:
168
+
169
+ ```shell
170
+ pip install -e doctr/.
171
+ ```
172
+
173
+ ## Models architectures
174
+
175
+ Credits where it's due: this repository is implementing, among others, architectures from published research papers.
176
+
177
+ ### Text Detection
178
+
179
+ - DBNet: [Real-time Scene Text Detection with Differentiable Binarization](https://arxiv.org/pdf/1911.08947.pdf).
180
+ - LinkNet: [LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation](https://arxiv.org/pdf/1707.03718.pdf)
181
+ - FAST: [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/pdf/2111.02394.pdf)
182
+
183
+ ### Text Recognition
184
+
185
+ - CRNN: [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/pdf/1507.05717.pdf).
186
+ - SAR: [Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition](https://arxiv.org/pdf/1811.00751.pdf).
187
+ - MASTER: [MASTER: Multi-Aspect Non-local Network for Scene Text Recognition](https://arxiv.org/pdf/1910.02562.pdf).
188
+ - ViTSTR: [Vision Transformer for Fast and Efficient Scene Text Recognition](https://arxiv.org/pdf/2105.08582.pdf).
189
+ - PARSeq: [Scene Text Recognition with Permuted Autoregressive Sequence Models](https://arxiv.org/pdf/2207.06966).
190
+ - VIPTR: [A Vision Permutable Extractor for Fast and Efficient Scene Text Recognition](https://arxiv.org/abs/2401.10110).
191
+
192
+ ## More goodies
193
+
194
+ ### Documentation
195
+
196
+ The full package documentation is available [here](https://mindee.github.io/doctr/) for detailed specifications.
197
+
198
+ ### Demo app
199
+
200
+ A minimal demo app is provided for you to play with our end-to-end OCR models!
201
+
202
+ ![Demo app](https://github.com/mindee/doctr/raw/main/docs/images/demo_update.png)
203
+
204
+ #### Live demo
205
+
206
+ Courtesy of :hugs: [Hugging Face](https://huggingface.co/) :hugs:, docTR now has a fully deployed version available on [Spaces](https://huggingface.co/spaces)!
207
+ Check it out [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr)
208
+
209
+ #### Running it locally
210
+
211
+ If you prefer to use it locally, there is an extra dependency ([Streamlit](https://streamlit.io/)) that is required.
212
+
213
+ ```shell
214
+ pip install -r demo/pt-requirements.txt
215
+ ```
216
+
217
+ Then run your app in your default browser with:
218
+
219
+ ```shell
220
+ streamlit run demo/app.py
221
+ ```
222
+
223
+ ### Docker container
224
+
225
+ We offer Docker container support for easy testing and deployment. [Here are the available docker tags.](https://github.com/mindee/doctr/pkgs/container/doctr).
226
+
227
+ #### Using GPU with docTR Docker Images
228
+
229
+ The docTR Docker images are GPU-ready and based on CUDA `12.2`. Make sure your host is **at least `12.2`**, otherwise Torch won't be able to initialize the GPU.
230
+ Please ensure that Docker is configured to use your GPU.
231
+
232
+ To verify and configure GPU support for Docker, please follow the instructions provided in the [NVIDIA Container Toolkit Installation Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
233
+
234
+ Once Docker is configured to use GPUs, you can run docTR Docker containers with GPU support:
235
+
236
+ ```shell
237
+ docker run -it --gpus all ghcr.io/mindee/doctr:torch-py3.9.18-2024-10 bash
238
+ ```
239
+
240
+ #### Available Tags
241
+
242
+ The Docker images for docTR follow a specific tag nomenclature: `<deps>-py<python_version>-<doctr_version|YYYY-MM>`. Here's a breakdown of the tag structure:
243
+
244
+ - `<deps>`: `torch`, `torch-viz-html-contrib`.
245
+ - `<python_version>`: `3.9.18`, `3.10.13` or `3.11.8`.
246
+ - `<doctr_version>`: a tag >= `v0.11.0`
247
+ - `<YYYY-MM>`: e.g. `2024-10`
248
+
249
+ Here are examples of different image tags:
250
+
251
+ | Tag | Description |
252
+ |----------------------------|---------------------------------------------------|
253
+ | `torch-viz-html-contrib-py3.11.8-2024-10` | Torch with extra dependencies, Python version `3.11.8`, from latest commit on `main` in `2024-10`. |
254
+ | `torch-py3.11.8-2024-10`| PyTorch with Python version `3.11.8`, from latest commit on `main` in `2024-10`. |
255
+
256
+ #### Building Docker Images Locally
257
+
258
+ You can also build docTR Docker images locally on your computer.
259
+
260
+ ```shell
261
+ docker build -t doctr .
262
+ ```
263
+
264
+ You can specify custom Python versions and docTR versions using build arguments. For example, to build a docTR image with PyTorch, Python version `3.9.10`, and docTR version `v0.7.0`, run the following command:
265
+
266
+ ```shell
267
+ docker build -t doctr --build-arg FRAMEWORK=torch --build-arg PYTHON_VERSION=3.9.10 --build-arg DOCTR_VERSION=v0.7.0 .
268
+ ```
269
+
270
+ ### Example script
271
+
272
+ An example script is provided for a simple documentation analysis of a PDF or image file:
273
+
274
+ ```shell
275
+ python scripts/analyze.py path/to/your/doc.pdf
276
+ ```
277
+
278
+ All script arguments can be checked using `python scripts/analyze.py --help`
279
+
280
+ ### Minimal API integration
281
+
282
+ Looking to integrate docTR into your API? Here is a template to get you started with a fully working API using the wonderful [FastAPI](https://github.com/tiangolo/fastapi) framework.
283
+
284
+ #### Deploy your API locally
285
+
286
+ Specific dependencies are required to run the API template, which you can install as follows:
287
+
288
+ ```shell
289
+ cd api/
290
+ pip install poetry
291
+ make lock
292
+ pip install -r requirements.txt
293
+ ```
294
+
295
+ You can now run your API locally:
296
+
297
+ ```shell
298
+ uvicorn --reload --workers 1 --host 0.0.0.0 --port=8002 --app-dir api/ app.main:app
299
+ ```
300
+
301
+ Alternatively, you can run the same server on a docker container if you prefer using:
302
+
303
+ ```shell
304
+ PORT=8002 docker-compose up -d --build
305
+ ```
306
+
307
+ #### What you have deployed
308
+
309
+ Your API should now be running locally on your port 8002. Access your automatically-built documentation at [http://localhost:8002/redoc](http://localhost:8002/redoc) and enjoy your four functional routes ("/detection", "/recognition", "/ocr", "/kie"). Here is an example with Python to send a request to the OCR route:
310
+
311
+ ```python
312
+ import requests
313
+
314
+ params = {"det_arch": "db_resnet50", "reco_arch": "crnn_vgg16_bn"}
315
+
316
+ with open('/path/to/your/doc.jpg', 'rb') as f:
317
+ files = [ # application/pdf, image/jpeg, image/png supported
318
+ ("files", ("doc.jpg", f.read(), "image/jpeg")),
319
+ ]
320
+ print(requests.post("http://localhost:8002/ocr", params=params, files=files).json())
321
+ ```
322
+
323
+ ### Example notebooks
324
+
325
+ Looking for more illustrations of docTR features? You might want to check the [Jupyter notebooks](https://github.com/mindee/doctr/tree/main/notebooks) designed to give you a broader overview.
326
+
327
+ ## Citation
328
+
329
+ If you wish to cite this project, feel free to use this [BibTeX](http://www.bibtex.org/) reference:
330
+
331
+ ```bibtex
332
+ @misc{doctr2021,
333
+ title={docTR: Document Text Recognition},
334
+ author={Mindee},
335
+ year={2021},
336
+ publisher = {GitHub},
337
+ howpublished = {\url{https://github.com/mindee/doctr}}
338
+ }
339
+ ```
340
+
341
+ ## Contributing
342
+
343
+ If you scrolled down to this section, you most likely appreciate open source. Do you feel like extending the range of our supported characters? Or perhaps submitting a paper implementation? Or contributing in any other way?
344
+
345
+ You're in luck, we compiled a short guide (cf. [`CONTRIBUTING`](https://mindee.github.io/doctr/contributing/contributing.html)) for you to easily do so!
346
+
347
+ ## License
348
+
349
+ Distributed under the Apache 2.0 License. See [`LICENSE`](https://github.com/mindee/doctr?tab=Apache-2.0-1-ov-file#readme) for more information.
src/python-doctr/api/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ poetry.lock
2
+ requirements*
src/python-doctr/api/Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.10-slim

WORKDIR /app

# Set environment variables.
# NOTE: use the `ENV key=value` form — the space-separated `ENV key value`
# form is legacy and deprecated in the Dockerfile reference.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONPATH="${PYTHONPATH}:/app"

# System dependencies: git (pip VCS installs), ffmpeg/libsm6/libxext6 (OpenCV
# runtime requirements), make (to run the lock target below).
RUN apt-get update \
    && apt-get install --no-install-recommends git ffmpeg libsm6 libxext6 make -y \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

COPY pyproject.toml /app/pyproject.toml
COPY Makefile /app/Makefile

# Pin dependencies with poetry (make lock), install them, then trim pip caches
# to keep the image small.
RUN pip install --upgrade pip setuptools wheel \
    && make lock \
    && pip install -r /app/requirements.txt \
    && pip cache purge \
    && rm -rf /root/.cache/pip

# copy project
COPY app /app/app
src/python-doctr/api/Makefile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# api setup is borrowed from https://github.com/frgfm/Holocron/blob/main/api

.PHONY: lock run stop test
# Pin the dependencies
lock:
	# Quote the requirement: an unquoted `poetry>=1.0` is parsed by the shell
	# as an output redirection to a file named `=1.0`, dropping the version pin.
	pip install "poetry>=1.0" poetry-plugin-export
	poetry lock
	poetry export -f requirements.txt --without-hashes --output requirements.txt
	poetry export -f requirements.txt --without-hashes --with dev --output requirements-dev.txt

# Run the docker
run:
	docker compose up -d --build

# Stop the docker
stop:
	docker compose down

# Run tests for the library
test:
	docker compose up -d --build
	docker cp requirements-dev.txt api_web:/app/requirements-dev.txt
	docker compose exec -T web pip install -r requirements-dev.txt
	docker cp tests api_web:/app/tests
	docker compose exec -T web pytest tests/ -vv
	docker compose down
src/python-doctr/api/README.md ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Template for your OCR API using docTR
2
+
3
+ ## Installation
4
+
5
+ You will only need to install [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git), [Docker](https://docs.docker.com/get-docker/) and [poetry](https://python-poetry.org/docs/#installation). The container environment will be self-sufficient and install the remaining dependencies on its own.
6
+
7
+ ## Usage
8
+
9
+ ### Starting your web server
10
+
11
+ You will need to clone the repository first, go into `api` folder and start the api:
12
+
13
+ ```shell
14
+ git clone https://github.com/mindee/doctr.git
15
+ cd doctr/api
16
+ make run
17
+ ```
18
+
19
+ Once completed, your [FastAPI](https://fastapi.tiangolo.com/) server should be running on port 8080.
20
+
21
+ ### Documentation and swagger
22
+
23
+ FastAPI comes with many advantages including speed and OpenAPI features. For instance, once your server is running, you can access the automatically built documentation and swagger in your browser at: [http://localhost:8080/docs](http://localhost:8080/docs)
24
+
25
+ ### Using the routes
26
+
27
+ You will find detailed instructions in the live documentation when your server is up, but here are some examples to use your available API routes:
28
+
29
+ #### Text detection
30
+
31
+ Using the following image:
32
+ <img src="https://user-images.githubusercontent.com/76527547/117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg" width="50%" height="50%">
33
+
34
+ with this snippet:
35
+
36
+ ```python
37
+ import requests
38
+
39
+ headers = {"accept": "application/json"}
40
+ params = {"det_arch": "db_resnet50"}
41
+
42
+ with open('/path/to/your/img.jpg', 'rb') as f:
43
+ files = [ # application/pdf, image/jpeg, image/png supported
44
+ ("files", ("117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg", f.read(), "image/jpeg")),
45
+ ]
46
+ print(requests.post("http://localhost:8080/detection", headers=headers, params=params, files=files).json())
47
+ ```
48
+
49
+ should yield
50
+
51
+ ```json
52
+ [
53
+ {
54
+ "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
55
+ "geometries": [
56
+ [
57
+ 0.8176307908857315,
58
+ 0.1787109375,
59
+ 0.9101580212741838,
60
+ 0.2080078125
61
+ ],
62
+ [
63
+ 0.7471996155154171,
64
+ 0.1796875,
65
+ 0.8272978149561669,
66
+ 0.20703125
67
+ ]
68
+ ]
69
+ }
70
+ ]
71
+ ```
72
+
73
+ #### Text recognition
74
+
75
+ Using the following image:
76
+ ![recognition-sample](https://user-images.githubusercontent.com/76527547/117133599-c073fa00-ada4-11eb-831b-412de4d28341.jpeg)
77
+
78
+ with this snippet:
79
+
80
+ ```python
81
+ import requests
82
+
83
+ headers = {"accept": "application/json"}
84
+ params = {"reco_arch": "crnn_vgg16_bn"}
85
+
86
+ with open('/path/to/your/img.jpg', 'rb') as f:
87
+ files = [ # application/pdf, image/jpeg, image/png supported
88
+ ("files", ("117133599-c073fa00-ada4-11eb-831b-412de4d28341.jpeg", f.read(), "image/jpeg")),
89
+ ]
90
+ print(requests.post("http://localhost:8080/recognition", headers=headers, params=params, files=files).json())
91
+ ```
92
+
93
+ should yield
94
+
95
+ ```json
96
+ [
97
+ {
98
+ "name": "117133599-c073fa00-ada4-11eb-831b-412de4d28341.jpeg",
99
+ "value": "invite",
100
+ "confidence": 1.0
101
+ }
102
+ ]
103
+ ```
104
+
105
+ #### End-to-end OCR
106
+
107
+ Using the following image:
108
+ <img src="https://user-images.githubusercontent.com/76527547/117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg" width="50%" height="50%">
109
+
110
+ with this snippet:
111
+
112
+ ```python
113
+ import requests
114
+
115
+ headers = {"accept": "application/json"}
116
+ params = {"det_arch": "db_resnet50", "reco_arch": "crnn_vgg16_bn"}
117
+
118
+ with open('/path/to/your/img.jpg', 'rb') as f:
119
+ files = [ # application/pdf, image/jpeg, image/png supported
120
+ ("files", ("117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg", f.read(), "image/jpeg")),
121
+ ]
122
+ print(requests.post("http://localhost:8080/ocr", headers=headers, params=params, files=files).json())
123
+ ```
124
+
125
+ should yield
126
+
127
+ ```json
128
+ [
129
+ {
130
+ "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
131
+ "orientation": {
132
+ "value": 0,
133
+ "confidence": null
134
+ },
135
+ "language": {
136
+ "value": null,
137
+ "confidence": null
138
+ },
139
+ "dimensions": [2339, 1654],
140
+ "items": [
141
+ {
142
+ "blocks": [
143
+ {
144
+ "geometry": [
145
+ 0.7471996155154171,
146
+ 0.1787109375,
147
+ 0.9101580212741838,
148
+ 0.2080078125
149
+ ],
150
+ "objectness_score": 0.5,
151
+ "lines": [
152
+ {
153
+ "geometry": [
154
+ 0.7471996155154171,
155
+ 0.1787109375,
156
+ 0.9101580212741838,
157
+ 0.2080078125
158
+ ],
159
+ "objectness_score": 0.5,
160
+ "words": [
161
+ {
162
+ "value": "Hello",
163
+ "geometry": [
164
+ 0.7471996155154171,
165
+ 0.1796875,
166
+ 0.8272978149561669,
167
+ 0.20703125
168
+ ],
169
+ "objectness_score": 0.5,
170
+ "confidence": 1.0,
171
+ "crop_orientation": {"value": 0, "confidence": null}
172
+ },
173
+ {
174
+ "value": "world!",
175
+ "geometry": [
176
+ 0.8176307908857315,
177
+ 0.1787109375,
178
+ 0.9101580212741838,
179
+ 0.2080078125
180
+ ],
181
+ "objectness_score": 0.5,
182
+ "confidence": 1.0,
183
+ "crop_orientation": {"value": 0, "confidence": null}
184
+ }
185
+ ]
186
+ }
187
+ ]
188
+ }
189
+ ]
190
+ }
191
+ ]
192
+ }
193
+ ]
194
+ ```
src/python-doctr/api/app/config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

import os

import doctr

# Metadata surfaced in the FastAPI-generated documentation.
PROJECT_NAME: str = "docTR API template"
PROJECT_DESCRIPTION: str = "Template API for Optical Character Recognition"
# Keep the API version in sync with the installed docTR package.
VERSION: str = doctr.__version__
# Debug stays enabled unless the DEBUG env var is exactly the string "False"
# (unset or any other value leaves it on).
DEBUG: bool = os.environ.get("DEBUG", "") != "False"
src/python-doctr/api/app/main.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

import time

from fastapi import FastAPI, Request
from fastapi.openapi.utils import get_openapi

from app import config as cfg
from app.routes import detection, kie, ocr, recognition

# Application instance: metadata comes from the config module so the docs stay
# in sync with the installed docTR version.
app = FastAPI(title=cfg.PROJECT_NAME, description=cfg.PROJECT_DESCRIPTION, debug=cfg.DEBUG, version=cfg.VERSION)


# Routing: one sub-router per task, each mounted under its own URL prefix.
app.include_router(recognition.router, prefix="/recognition", tags=["recognition"])
app.include_router(detection.router, prefix="/detection", tags=["detection"])
app.include_router(ocr.router, prefix="/ocr", tags=["ocr"])
app.include_router(kie.router, prefix="/kie", tags=["kie"])


# Middleware
@app.middleware("http")
async def add_process_time_header(request: Request, call_next):
    """Report each request's wall-clock processing time in an X-Process-Time header."""
    start_time = time.time()
    response = await call_next(request)
    process_time = time.time() - start_time
    response.headers["X-Process-Time"] = str(process_time)
    return response


# Docs
def custom_openapi():
    """Build the OpenAPI schema from the project metadata, caching it on the app."""
    if app.openapi_schema:
        # Schema already generated once: reuse the cached copy.
        return app.openapi_schema
    openapi_schema = get_openapi(
        title=cfg.PROJECT_NAME,
        version=cfg.VERSION,
        description=cfg.PROJECT_DESCRIPTION,
        routes=app.routes,
    )
    app.openapi_schema = openapi_schema
    return app.openapi_schema


# Replace FastAPI's default schema generator with the cached variant above.
app.openapi = custom_openapi
src/python-doctr/api/app/routes/detection.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.


from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status

from app.schemas import DetectionIn, DetectionOut
from app.utils import get_documents, resolve_geometry
from app.vision import init_predictor
from doctr.file_utils import CLASS_NAME

router = APIRouter()


@router.post("/", response_model=list[DetectionOut], status_code=status.HTTP_200_OK, summary="Perform text detection")
async def text_detection(request: DetectionIn = Depends(), files: list[UploadFile] = [File(...)]):
    """Runs docTR text detection model to analyze the input image

    Args:
        request: detection predictor configuration parsed from query parameters
        files: uploaded documents (JPEG/PNG images or PDFs)

    Returns:
        one DetectionOut per input page, holding the detected box geometries

    Raises:
        HTTPException: 400 when a file is unsupported or the predictor config is invalid
    """
    try:
        predictor = init_predictor(request)
        content, filenames = await get_documents(files)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    return [
        DetectionOut(
            name=filename,
            geometries=[
                # Straight boxes arrive as (5,) arrays [xmin, ymin, xmax, ymax, score]:
                # drop the trailing score. Otherwise the geometry is a polygon: flatten
                # its first four (x, y) corner points into one coordinate tuple.
                geom[:-1].tolist() if geom.shape == (5,) else resolve_geometry(geom[:4].tolist())
                for geom in doc[CLASS_NAME]
            ],
        )
        for doc, filename in zip(predictor(content), filenames)
    ]
src/python-doctr/api/app/routes/kie.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.


from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status

from app.schemas import KIEElement, KIEIn, KIEOut
from app.utils import get_documents, resolve_geometry
from app.vision import init_predictor

router = APIRouter()


@router.post("/", response_model=list[KIEOut], status_code=status.HTTP_200_OK, summary="Perform KIE")
async def perform_kie(request: KIEIn = Depends(), files: list[UploadFile] = [File(...)]):
    """Runs docTR KIE model to analyze the input image

    Args:
        request: KIE predictor configuration parsed from query parameters
        files: uploaded documents (JPEG/PNG images or PDFs)

    Returns:
        one KIEOut per analyzed page, with predictions grouped by detected class

    Raises:
        HTTPException: 400 when a file is unsupported or the predictor config is invalid
    """
    try:
        predictor = init_predictor(request)
        content, filenames = await get_documents(files)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    out = predictor(content)

    # One KIEOut per page; each page groups its items by predicted class name.
    results = [
        KIEOut(
            name=filenames[i],
            orientation=page.orientation,
            language=page.language,
            dimensions=page.dimensions,
            predictions=[
                KIEElement(
                    class_name=class_name,
                    items=[
                        dict(
                            value=prediction.value,
                            # Flatten the (x, y) point pairs into one coordinate tuple.
                            geometry=resolve_geometry(prediction.geometry),
                            objectness_score=round(prediction.objectness_score, 2),
                            confidence=round(prediction.confidence, 2),
                            crop_orientation=prediction.crop_orientation,
                        )
                        for prediction in page.predictions[class_name]
                    ],
                )
                for class_name in page.predictions.keys()
            ],
        )
        for i, page in enumerate(out.pages)
    ]

    return results
src/python-doctr/api/app/routes/ocr.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.


from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status

from app.schemas import OCRBlock, OCRIn, OCRLine, OCROut, OCRPage, OCRWord
from app.utils import get_documents, resolve_geometry
from app.vision import init_predictor

router = APIRouter()


@router.post("/", response_model=list[OCROut], status_code=status.HTTP_200_OK, summary="Perform OCR")
async def perform_ocr(request: OCRIn = Depends(), files: list[UploadFile] = [File(...)]):
    """Runs docTR OCR model to analyze the input image

    Args:
        request: OCR predictor configuration parsed from query parameters
        files: uploaded documents (JPEG/PNG images or PDFs)

    Returns:
        one OCROut per analyzed page, mirroring docTR's page/block/line/word hierarchy

    Raises:
        HTTPException: 400 when a file is unsupported or the predictor config is invalid
    """
    try:
        # generator object to list
        content, filenames = await get_documents(files)
        predictor = init_predictor(request)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))

    out = predictor(content)

    # Convert the nested docTR Document structure into the response schema,
    # rounding scores to 2 decimals and flattening geometries on the way.
    results = [
        OCROut(
            name=filenames[i],
            orientation=page.orientation,
            language=page.language,
            dimensions=page.dimensions,
            items=[
                OCRPage(
                    blocks=[
                        OCRBlock(
                            geometry=resolve_geometry(block.geometry),
                            objectness_score=round(block.objectness_score, 2),
                            lines=[
                                OCRLine(
                                    geometry=resolve_geometry(line.geometry),
                                    objectness_score=round(line.objectness_score, 2),
                                    words=[
                                        OCRWord(
                                            value=word.value,
                                            geometry=resolve_geometry(word.geometry),
                                            objectness_score=round(word.objectness_score, 2),
                                            confidence=round(word.confidence, 2),
                                            crop_orientation=word.crop_orientation,
                                        )
                                        for word in line.words
                                    ],
                                )
                                for line in block.lines
                            ],
                        )
                        for block in page.blocks
                    ]
                )
            ],
        )
        for i, page in enumerate(out.pages)
    ]

    return results
src/python-doctr/api/app/routes/recognition.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.


from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status

from app.schemas import RecognitionIn, RecognitionOut
from app.utils import get_documents
from app.vision import init_predictor

router = APIRouter()


@router.post(
    "/", response_model=list[RecognitionOut], status_code=status.HTTP_200_OK, summary="Perform text recognition"
)
async def text_recognition(request: RecognitionIn = Depends(), files: list[UploadFile] = [File(...)]):
    """Runs docTR text recognition model to analyze the input image"""
    try:
        predictor = init_predictor(request)
        content, filenames = await get_documents(files)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    # The predictor yields one (value, confidence) pair per input crop,
    # in the same order as the uploaded files.
    outputs = []
    for prediction, filename in zip(predictor(content), filenames):
        word_value, word_confidence = prediction[0], prediction[1]
        outputs.append(RecognitionOut(name=filename, value=word_value, confidence=round(word_confidence, 2)))
    return outputs
src/python-doctr/api/app/schemas.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (C) 2021-2025, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

from typing import Any

from pydantic import BaseModel, Field


class KIEIn(BaseModel):
    """Query parameters configuring the KIE predictor (architectures, batching, thresholds)."""

    det_arch: str = Field(default="db_resnet50", examples=["db_resnet50"])
    reco_arch: str = Field(default="crnn_vgg16_bn", examples=["crnn_vgg16_bn"])
    assume_straight_pages: bool = Field(default=True, examples=[True])
    preserve_aspect_ratio: bool = Field(default=True, examples=[True])
    detect_orientation: bool = Field(default=False, examples=[False])
    detect_language: bool = Field(default=False, examples=[False])
    symmetric_pad: bool = Field(default=True, examples=[True])
    straighten_pages: bool = Field(default=False, examples=[False])
    # Batch sizes for the detection and recognition stages.
    det_bs: int = Field(default=2, examples=[2])
    reco_bs: int = Field(default=128, examples=[128])
    disable_page_orientation: bool = Field(default=False, examples=[False])
    disable_crop_orientation: bool = Field(default=False, examples=[False])
    # Binarization and box-score thresholds of the detection post-processor.
    bin_thresh: float = Field(default=0.1, examples=[0.1])
    box_thresh: float = Field(default=0.1, examples=[0.1])


class OCRIn(KIEIn, BaseModel):
    """KIE parameters plus the layout-resolution options used by the OCR route."""

    resolve_lines: bool = Field(default=True, examples=[True])
    resolve_blocks: bool = Field(default=False, examples=[False])
    paragraph_break: float = Field(default=0.0035, examples=[0.0035])


class RecognitionIn(BaseModel):
    """Query parameters configuring the recognition-only predictor."""

    reco_arch: str = Field(default="crnn_vgg16_bn", examples=["crnn_vgg16_bn"])
    reco_bs: int = Field(default=128, examples=[128])


class DetectionIn(BaseModel):
    """Query parameters configuring the detection-only predictor."""

    det_arch: str = Field(default="db_resnet50", examples=["db_resnet50"])
    assume_straight_pages: bool = Field(default=True, examples=[True])
    preserve_aspect_ratio: bool = Field(default=True, examples=[True])
    symmetric_pad: bool = Field(default=True, examples=[True])
    det_bs: int = Field(default=2, examples=[2])
    bin_thresh: float = Field(default=0.1, examples=[0.1])
    box_thresh: float = Field(default=0.1, examples=[0.1])


class RecognitionOut(BaseModel):
    """One recognized word crop: source filename, text value, confidence."""

    name: str = Field(..., examples=["example.jpg"])
    value: str = Field(..., examples=["Hello"])
    confidence: float = Field(..., examples=[0.99])


class DetectionOut(BaseModel):
    """Detected box geometries for one input page."""

    name: str = Field(..., examples=["example.jpg"])
    geometries: list[list[float]] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])


class OCRWord(BaseModel):
    """A single recognized word with its geometry and scores."""

    value: str = Field(..., examples=["example"])
    geometry: list[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
    objectness_score: float = Field(..., examples=[0.99])
    confidence: float = Field(..., examples=[0.99])
    crop_orientation: dict[str, Any] = Field(..., examples=[{"value": 0, "confidence": None}])


class OCRLine(BaseModel):
    """A text line: its geometry plus the words it contains."""

    geometry: list[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
    objectness_score: float = Field(..., examples=[0.99])
    words: list[OCRWord] = Field(
        ...,
        examples=[
            {
                "value": "example",
                "geometry": [0.0, 0.0, 0.0, 0.0],
                "objectness_score": 0.99,
                "confidence": 0.99,
                "crop_orientation": {"value": 0, "confidence": None},
            }
        ],
    )


class OCRBlock(BaseModel):
    """A block of text: its geometry plus the lines it contains."""

    geometry: list[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
    objectness_score: float = Field(..., examples=[0.99])
    lines: list[OCRLine] = Field(
        ...,
        examples=[
            {
                "geometry": [0.0, 0.0, 0.0, 0.0],
                "objectness_score": 0.99,
                "words": [
                    {
                        "value": "example",
                        "geometry": [0.0, 0.0, 0.0, 0.0],
                        "confidence": 0.99,
                        "crop_orientation": {"value": 0, "confidence": None},
                    }
                ],
            }
        ],
    )


class OCRPage(BaseModel):
    """One analyzed page as a list of blocks."""

    blocks: list[OCRBlock] = Field(
        ...,
        examples=[
            {
                "geometry": [0.0, 0.0, 0.0, 0.0],
                "objectness_score": 0.99,
                "lines": [
                    {
                        "geometry": [0.0, 0.0, 0.0, 0.0],
                        "objectness_score": 0.99,
                        "words": [
                            {
                                "value": "example",
                                "geometry": [0.0, 0.0, 0.0, 0.0],
                                "objectness_score": 0.99,
                                "confidence": 0.99,
                                "crop_orientation": {"value": 0, "confidence": None},
                            }
                        ],
                    }
                ],
            }
        ],
    )


class OCROut(BaseModel):
    """Full OCR response for one page: metadata plus the page/block/line/word tree."""

    name: str = Field(..., examples=["example.jpg"])
    orientation: dict[str, float | None] = Field(..., examples=[{"value": 0.0, "confidence": 0.99}])
    language: dict[str, str | float | None] = Field(..., examples=[{"value": "en", "confidence": 0.99}])
    dimensions: tuple[int, int] = Field(..., examples=[(100, 100)])
    items: list[OCRPage] = Field(
        ...,
        examples=[
            {
                "geometry": [0.0, 0.0, 0.0, 0.0],
                "objectness_score": 0.99,
                "lines": [
                    {
                        "geometry": [0.0, 0.0, 0.0, 0.0],
                        "objectness_score": 0.99,
                        "words": [
                            {
                                "value": "example",
                                "geometry": [0.0, 0.0, 0.0, 0.0],
                                "objectness_score": 0.99,
                                "confidence": 0.99,
                                "crop_orientation": {"value": 0, "confidence": None},
                            }
                        ],
                    }
                ],
            }
        ],
    )


class KIEElement(BaseModel):
    """All predictions of one class on a page, as plain dict items."""

    class_name: str = Field(..., examples=["example"])
    items: list[dict[str, str | list[float] | float | dict[str, Any]]] = Field(
        ...,
        examples=[
            {
                "value": "example",
                "geometry": [0.0, 0.0, 0.0, 0.0],
                "objectness_score": 0.99,
                "confidence": 0.99,
                "crop_orientation": {"value": 0, "confidence": None},
            }
        ],
    )


class KIEOut(BaseModel):
    """Full KIE response for one page: metadata plus per-class predictions."""

    name: str = Field(..., examples=["example.jpg"])
    orientation: dict[str, float | None] = Field(..., examples=[{"value": 0.0, "confidence": 0.99}])
    language: dict[str, str | float | None] = Field(..., examples=[{"value": "en", "confidence": 0.99}])
    dimensions: tuple[int, int] = Field(..., examples=[(100, 100)])
    predictions: list[KIEElement]
src/python-doctr/api/app/utils.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2025, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+
7
+ from typing import Any
8
+
9
+ import numpy as np
10
+ from fastapi import UploadFile
11
+
12
+ from doctr.io import DocumentFile
13
+
14
+
15
+ def resolve_geometry(
16
+ geom: Any,
17
+ ) -> tuple[float, float, float, float] | tuple[float, float, float, float, float, float, float, float]:
18
+ if len(geom) == 4:
19
+ return (*geom[0], *geom[1], *geom[2], *geom[3])
20
+ return (*geom[0], *geom[1])
21
+
22
+
23
async def get_documents(files: list[UploadFile]) -> tuple[list[np.ndarray], list[str]]:  # pragma: no cover
    """Convert a list of UploadFile objects to lists of numpy arrays and their corresponding filenames

    Args:
        files: list of UploadFile objects

    Returns:
        tuple[list[np.ndarray], list[str]]: list of numpy arrays and their corresponding filenames

    Raises:
        ValueError: if a file has an unsupported MIME type

    """
    filenames: list[str] = []
    docs: list[np.ndarray] = []
    for file in files:
        mime_type = file.content_type
        if mime_type in ("image/jpeg", "image/png"):
            # A single image yields exactly one page.
            docs.extend(DocumentFile.from_images([await file.read()]))
            filenames.append(file.filename or "")
        elif mime_type == "application/pdf":
            # A PDF may yield several pages; repeat the filename once per page.
            pdf_content = DocumentFile.from_pdf(await file.read())
            docs.extend(pdf_content)
            # Bug fix: the previous `[file.filename] * n or [""] * n` never applied the
            # fallback — a non-empty list of None values is still truthy, so None could
            # leak into `filenames`. Apply the `or ""` fallback per element instead.
            filenames.extend([file.filename or ""] * len(pdf_content))
        else:
            raise ValueError(f"Unsupported file format: {mime_type} for file {file.filename}")

    return docs, filenames
src/python-doctr/api/app/vision.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2025, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+
7
+ from collections.abc import Callable
8
+
9
+ import torch
10
+
11
+ from doctr.models import kie_predictor, ocr_predictor
12
+
13
+ from .schemas import DetectionIn, KIEIn, OCRIn, RecognitionIn
14
+
15
+
16
+ def _move_to_device(predictor: Callable) -> Callable:
17
+ """Move the predictor to the desired device
18
+
19
+ Args:
20
+ predictor: the predictor to move
21
+
22
+ Returns:
23
+ Callable: the predictor moved to the desired device
24
+ """
25
+ return predictor.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
26
+
27
+
28
def init_predictor(request: KIEIn | OCRIn | RecognitionIn | DetectionIn) -> Callable:
    """Build the predictor matching the request type and move it to the right device.

    Args:
        request: input request

    Returns:
        Callable: the predictor
    """
    params = request.model_dump()
    # Thresholds are consumed here rather than forwarded to the predictor factory.
    bin_thresh = params.pop("bin_thresh", None)
    box_thresh = params.pop("box_thresh", None)

    def _set_thresholds(pred):
        # Both predictor families expose the detection postprocessor the same way.
        pred.det_predictor.model.postprocessor.bin_thresh = bin_thresh
        pred.det_predictor.model.postprocessor.box_thresh = box_thresh
        return pred

    if isinstance(request, (OCRIn, RecognitionIn, DetectionIn)):
        predictor = _set_thresholds(ocr_predictor(pretrained=True, **params))
        if isinstance(request, DetectionIn):
            # Detection-only: expose just the detection stage.
            return _move_to_device(predictor.det_predictor)
        if isinstance(request, RecognitionIn):
            # Recognition-only: expose just the recognition stage.
            return _move_to_device(predictor.reco_predictor)
        return _move_to_device(predictor)
    if isinstance(request, KIEIn):
        return _move_to_device(_set_thresholds(kie_predictor(pretrained=True, **params)))
src/python-doctr/api/docker-compose.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
services:
  # Single-service stack exposing the FastAPI app via uvicorn.
  web:
    container_name: api_web
    build:
      context: .
      dockerfile: Dockerfile
    # --reload enables auto-restart on code changes (development setting);
    # one worker, listening on all interfaces.
    command: uvicorn app.main:app --reload --workers 1 --host 0.0.0.0 --port 8080
    ports:
      # host:container
      - 8080:8080