LightOnOCR

Paused

App Files Files Community

IFMedTechdemo committed 27 days ago

Commit

f574169

verified ·

1 Parent(s): 3c37a52

Update app.py

Browse files

Files changed (1) hide show

app.py +344 -22

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
 import subprocess
 import sys
@@ -16,6 +16,7 @@ from transformers import (
     LightOnOCRProcessor,
 )
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 device = "cuda" if torch.cuda.is_available() else "cpu"
 if device == "cuda":
@@ -79,7 +80,6 @@ def clean_output_text(text):
     return cleaned
 def preprocess_image_for_ocr(image):
-    """Convert PIL.Image to adaptive thresholded image for OCR."""
     image_rgb = image.convert("RGB")
     img_np = np.array(image_rgb)
     gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
@@ -94,9 +94,58 @@ def preprocess_image_for_ocr(image):
     preprocessed_pil = Image.fromarray(adaptive_threshold)
     return preprocessed_pil
 @spaces.GPU
 def extract_text_from_image(image, temperature=0.2):
-    """OCR + clinical NER, with preprocessing."""
     processed_img = preprocess_image_for_ocr(image)
     chat = [
         {
@@ -133,22 +182,11 @@ def extract_text_from_image(image, temperature=0.2):
     )
     with torch.no_grad():
         outputs = ocr_model.generate(**generation_kwargs)
     output_text = processor.decode(outputs[0], skip_special_tokens=True)
     cleaned_text = clean_output_text(output_text)
-    entities = ner_pipeline(cleaned_text)
-    medications = []
-    for ent in entities:
-        if ent["entity_group"] == "treatment":
-            word = ent["word"]
-            if word.startswith("##") and medications:
-                medications[-1] += word[2:]
-            else:
-                medications.append(word)
-    medications_str = ", ".join(set(medications)) if medications else "None detected"
-    yield cleaned_text, medications_str, output_text, processed_img
-def process_input(file_input, temperature, page_num):
     if file_input is None:
         yield "Please upload an image or PDF first.", "", "", "", "No file!", 1
         return
@@ -176,11 +214,13 @@ def process_input(file_input, temperature, page_num):
             yield msg, "", msg, "", None, slider_value
             return
     try:
-        for cleaned_text, medications, raw_md, processed_img in extract_text_from_image(
             image_to_process, temperature
         ):
-            yield cleaned_text, medications, raw_md, page_info, processed_img, slider_value
     except Exception as e:
         error_msg = f"Error during text extraction: {str(e)}"
         yield error_msg, "", error_msg, page_info, image_to_process, slider_value
@@ -218,6 +258,12 @@ with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo
         label="Page Number (PDF only)",
         interactive=True
     )
     output_text = gr.Textbox(
         label="📝 Extracted Text",
         lines=4,
@@ -229,7 +275,7 @@ with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo
         label="💊 Extracted Medicines/Drugs",
         placeholder="Medicine/drug names will appear here...",
         lines=2,
-        max_lines=5,
         interactive=False,
         show_copy_button=True
     )
@@ -240,7 +286,7 @@ with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo
         interactive=False
     )
     page_info = gr.Markdown(
-        value="" # Info of PDF page
     )
     rendered_image = gr.Image(
         label="Processed Image (Thresholded for OCR)",
@@ -253,7 +299,7 @@ with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo
     submit_btn.click(
         fn=process_input,
-        inputs=[file_input, temperature, page_slider],
         outputs=[output_text, medicines_output, raw_output, page_info, rendered_image, num_pages]
     )
@@ -268,6 +314,282 @@ if __name__ == "__main__":
 # Create Gradio interface
 # with gr.Blocks(title="📖 Image/PDF OCR with LightOnOCR", theme=gr.themes.Soft()) as demo:

+#################################################################################################
 import subprocess
 import sys
     LightOnOCRProcessor,
 )
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+import re
 device = "cuda" if torch.cuda.is_available() else "cpu"
 if device == "cuda":
     return cleaned
 def preprocess_image_for_ocr(image):
     image_rgb = image.convert("RGB")
     img_np = np.array(image_rgb)
     gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
     preprocessed_pil = Image.fromarray(adaptive_threshold)
     return preprocessed_pil
+# Matches a line that starts with a dosage-form keyword or abbreviation
+# (case-insensitive) followed by whitespace and at least one character of
+# name/dose text. Compiled once at import time instead of on every call.
+_MED_LINE_RE = re.compile(
+    r"""
+    (?:                             # non-capturing group of alternatives
+        T\.?|TAB\.?|TABLET          # T, T., TAB, TAB., TABLET
+        |C\.?|CAP\.?|CAPSULE        # C, C., CAP, CAP., CAPSULE
+        |SYRUP|SYP
+        |ORAL
+        |INJ\.?|INJECTION           # INJ, INJ., INJECTION
+        |OINTMENT|DROPS|PATCH|SOL\.?|SOLUTION
+    )
+    \s+[A-Z0-9 \-\(\)/,.]+          # whitespace, then name/dose/other info
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+def extract_medication_lines(text):
+    """
+    Return the lines of *text* that look like medication entries.
+
+    A line qualifies when, after stripping surrounding whitespace, it starts
+    with one of the dosage-form keywords in ``_MED_LINE_RE`` (tab, tablet,
+    cap, capsule, syrup, syp, oral, inj, injection, ointment, drops, patch,
+    sol, solution, or the abbreviations T. / C.) followed by whitespace and
+    further text. Matching is case-insensitive.
+
+    Args:
+        text: Multi-line string to scan. ``None`` or an empty string yields
+            an empty result instead of raising.
+
+    Returns:
+        The matching lines, stripped and joined with newlines, in their
+        original order. An empty string when nothing matches.
+    """
+    if not text:
+        return ""
+    # Lines are stripped first, so the pattern needs no leading-space prefix
+    # and ``match`` anchors at the keyword itself.
+    stripped_lines = (line.strip() for line in text.split('\n'))
+    return '\n'.join(line for line in stripped_lines if _MED_LINE_RE.match(line))
+def extract_meds(text, use_ner):
+    """
+    Extract medications from *text* with either NER or regex rules.
+
+    Args:
+        text: OCR text to analyse.
+        use_ner: When truthy, run the module-level ``ner_pipeline`` and keep
+            entities whose ``entity_group`` is ``"treatment"``; otherwise
+            fall back to ``extract_medication_lines``.
+
+    Returns:
+        NER mode: the unique treatment words joined with ``", "`` in
+        first-seen order. Regex mode: the matching lines joined with
+        newlines. In both modes ``"None detected"`` when nothing is found.
+    """
+    if not use_ner:
+        return extract_medication_lines(text) or "None detected"
+    meds = []
+    for ent in ner_pipeline(text):
+        if ent["entity_group"] != "treatment":
+            continue
+        word = ent["word"]
+        if word.startswith("##") and meds:
+            # "##" marks a word-piece continuation: glue it onto the
+            # previous entry instead of listing it separately.
+            meds[-1] += word[2:]
+        else:
+            meds.append(word)
+    # dict.fromkeys de-duplicates while keeping first-seen order; a set
+    # would make the output order vary between runs.
+    unique_meds = list(dict.fromkeys(meds))
+    return ", ".join(unique_meds) if unique_meds else "None detected"
 @spaces.GPU
 def extract_text_from_image(image, temperature=0.2):
+    """OCR with adaptive thresholding."""
     processed_img = preprocess_image_for_ocr(image)
     chat = [
         {
     )
     with torch.no_grad():
         outputs = ocr_model.generate(**generation_kwargs)
     output_text = processor.decode(outputs[0], skip_special_tokens=True)
     cleaned_text = clean_output_text(output_text)
+    yield cleaned_text, output_text, processed_img
+def process_input(file_input, temperature, page_num, extraction_mode):
     if file_input is None:
         yield "Please upload an image or PDF first.", "", "", "", "No file!", 1
         return
             yield msg, "", msg, "", None, slider_value
             return
+    use_ner = extraction_mode == "Regex"   #"Clinical NER"
     try:
+        for cleaned_text, raw_md, processed_img in extract_text_from_image(
             image_to_process, temperature
         ):
+            meds_out = extract_meds(cleaned_text, use_ner)
+            yield cleaned_text, meds_out, raw_md, page_info, processed_img, slider_value
     except Exception as e:
         error_msg = f"Error during text extraction: {str(e)}"
         yield error_msg, "", error_msg, page_info, image_to_process, slider_value
         label="Page Number (PDF only)",
         interactive=True
     )
+    extraction_mode = gr.Radio(
+        choices=["Clinical NER", "Regex"],
+        value="Regex",
+        label="Extraction Method",
+        info="Clinical NER uses ML, Regex uses rules"
+    )
     output_text = gr.Textbox(
         label="📝 Extracted Text",
         lines=4,
         label="💊 Extracted Medicines/Drugs",
         placeholder="Medicine/drug names will appear here...",
         lines=2,
+        max_lines=10,
         interactive=False,
         show_copy_button=True
     )
         interactive=False
     )
     page_info = gr.Markdown(
+        value=""  # Info of PDF page
     )
     rendered_image = gr.Image(
         label="Processed Image (Thresholded for OCR)",
     submit_btn.click(
         fn=process_input,
+        inputs=[file_input, temperature, page_slider, extraction_mode],
         outputs=[output_text, medicines_output, raw_output, page_info, rendered_image, num_pages]
     )
+#################### Previous working version (NER-only extraction), kept commented out for reference ####################
+#!/usr/bin/env python3
+# import subprocess
+# import sys
+# import spaces
+# import torch
+# import gradio as gr
+# from PIL import Image
+# import numpy as np
+# import cv2
+# import pypdfium2 as pdfium
+# from transformers import (
+#     LightOnOCRForConditionalGeneration,
+#     LightOnOCRProcessor,
+# )
+# from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# if device == "cuda":
+#     attn_implementation = "sdpa"
+#     dtype = torch.bfloat16
+# else:
+#     attn_implementation = "eager"
+#     dtype = torch.float32
+# ocr_model = LightOnOCRForConditionalGeneration.from_pretrained(
+#     "lightonai/LightOnOCR-1B-1025",
+#     attn_implementation=attn_implementation,
+#     torch_dtype=dtype,
+#     trust_remote_code=True,
+# ).to(device).eval()
+# processor = LightOnOCRProcessor.from_pretrained(
+#     "lightonai/LightOnOCR-1B-1025",
+#     trust_remote_code=True,
+# )
+# ner_tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+# ner_model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+# ner_pipeline = pipeline(
+#     "ner",
+#     model=ner_model,
+#     tokenizer=ner_tokenizer,
+#     aggregation_strategy="simple",
+# )
+# def render_pdf_page(page, max_resolution=1540, scale=2.77):
+#     width, height = page.get_size()
+#     pixel_width = width * scale
+#     pixel_height = height * scale
+#     resize_factor = min(1, max_resolution / pixel_width, max_resolution / pixel_height)
+#     target_scale = scale * resize_factor
+#     return page.render(scale=target_scale, rev_byteorder=True).to_pil()
+# def process_pdf(pdf_path, page_num=1):
+#     pdf = pdfium.PdfDocument(pdf_path)
+#     total_pages = len(pdf)
+#     page_idx = min(max(int(page_num) - 1, 0), total_pages - 1)
+#     page = pdf[page_idx]
+#     img = render_pdf_page(page)
+#     pdf.close()
+#     return img, total_pages, page_idx + 1
+# def clean_output_text(text):
+#     markers_to_remove = ["system", "user", "assistant"]
+#     lines = text.split('\n')
+#     cleaned_lines = []
+#     for line in lines:
+#         stripped = line.strip()
+#         if stripped.lower() not in markers_to_remove:
+#             cleaned_lines.append(line)
+#     cleaned = '\n'.join(cleaned_lines).strip()
+#     if "assistant" in text.lower():
+#         parts = text.split("assistant", 1)
+#         if len(parts) > 1:
+#             cleaned = parts[1].strip()
+#     return cleaned
+# def preprocess_image_for_ocr(image):
+#     """Convert PIL.Image to adaptive thresholded image for OCR."""
+#     image_rgb = image.convert("RGB")
+#     img_np = np.array(image_rgb)
+#     gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
+#     adaptive_threshold = cv2.adaptiveThreshold(
+#         gray,
+#         255,
+#         cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+#         cv2.THRESH_BINARY,
+#         85,
+#         11,
+#     )
+#     preprocessed_pil = Image.fromarray(adaptive_threshold)
+#     return preprocessed_pil
+# @spaces.GPU
+# def extract_text_from_image(image, temperature=0.2):
+#     """OCR + clinical NER, with preprocessing."""
+#     processed_img = preprocess_image_for_ocr(image)
+#     chat = [
+#         {
+#             "role": "user",
+#             "content": [
+#                 {"type": "image", "image": processed_img}
+#             ],
+#         }
+#     ]
+#     inputs = processor.apply_chat_template(
+#         chat,
+#         add_generation_prompt=True,
+#         tokenize=True,
+#         return_dict=True,
+#         return_tensors="pt",
+#     )
+#     # Move inputs to device
+#     inputs = {
+#         k: (
+#             v.to(device=device, dtype=dtype)
+#             if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
+#             else v.to(device)
+#             if isinstance(v, torch.Tensor)
+#             else v
+#         )
+#         for k, v in inputs.items()
+#     }
+#     generation_kwargs = dict(
+#         **inputs,
+#         max_new_tokens=2048,
+#         temperature=temperature if temperature > 0 else 0.0,
+#         use_cache=True,
+#         do_sample=temperature > 0,
+#     )
+#     with torch.no_grad():
+#         outputs = ocr_model.generate(**generation_kwargs)
+#     output_text = processor.decode(outputs[0], skip_special_tokens=True)
+#     cleaned_text = clean_output_text(output_text)
+#     entities = ner_pipeline(cleaned_text)
+#     medications = []
+#     for ent in entities:
+#         if ent["entity_group"] == "treatment":
+#             word = ent["word"]
+#             if word.startswith("##") and medications:
+#                 medications[-1] += word[2:]
+#             else:
+#                 medications.append(word)
+#     medications_str = ", ".join(set(medications)) if medications else "None detected"
+#     yield cleaned_text, medications_str, output_text, processed_img
+# def process_input(file_input, temperature, page_num):
+#     if file_input is None:
+#         yield "Please upload an image or PDF first.", "", "", "", "No file!", 1
+#         return
+#     image_to_process = None
+#     page_info = ""
+#     slider_value = page_num
+#     file_path = file_input if isinstance(file_input, str) else file_input.name
+#     if file_path.lower().endswith(".pdf"):
+#         try:
+#             image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
+#             page_info = f"Processing page {actual_page} of {total_pages}"
+#             slider_value = actual_page
+#         except Exception as e:
+#             msg = f"Error processing PDF: {str(e)}"
+#             yield msg, "", msg, "", None, slider_value
+#             return
+#     else:
+#         try:
+#             image_to_process = Image.open(file_path)
+#             page_info = "Processing image"
+#         except Exception as e:
+#             msg = f"Error opening image: {str(e)}"
+#             yield msg, "", msg, "", None, slider_value
+#             return
+#     try:
+#         for cleaned_text, medications, raw_md, processed_img in extract_text_from_image(
+#             image_to_process, temperature
+#         ):
+#             yield cleaned_text, medications, raw_md, page_info, processed_img, slider_value
+#     except Exception as e:
+#         error_msg = f"Error during text extraction: {str(e)}"
+#         yield error_msg, "", error_msg, page_info, image_to_process, slider_value
+# def update_slider(file_input):
+#     if file_input is None:
+#         return gr.update(maximum=20, value=1)
+#     file_path = file_input if isinstance(file_input, str) else file_input.name
+#     if file_path.lower().endswith('.pdf'):
+#         try:
+#             pdf = pdfium.PdfDocument(file_path)
+#             total_pages = len(pdf)
+#             pdf.close()
+#             return gr.update(maximum=total_pages, value=1)
+#         except:
+#             return gr.update(maximum=20, value=1)
+#     else:
+#         return gr.update(maximum=1, value=1)
+# with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo:
+#     file_input = gr.File(
+#         label="🖼️ Upload Image or PDF",
+#         file_types=[".pdf", ".png", ".jpg", ".jpeg"],
+#         type="filepath"
+#     )
+#     temperature = gr.Slider(
+#         minimum=0.0,
+#         maximum=1.0,
+#         value=0.2,
+#         step=0.05,
+#         label="Temperature"
+#     )
+#     page_slider = gr.Slider(
+#         minimum=1, maximum=20, value=1, step=1,
+#         label="Page Number (PDF only)",
+#         interactive=True
+#     )
+#     output_text = gr.Textbox(
+#         label="📝 Extracted Text",
+#         lines=4,
+#         max_lines=10,
+#         interactive=False,
+#         show_copy_button=True
+#     )
+#     medicines_output = gr.Textbox(
+#         label="💊 Extracted Medicines/Drugs",
+#         placeholder="Medicine/drug names will appear here...",
+#         lines=2,
+#         max_lines=5,
+#         interactive=False,
+#         show_copy_button=True
+#     )
+#     raw_output = gr.Textbox(
+#         label="Raw Model Output",
+#         lines=2,
+#         max_lines=5,
+#         interactive=False
+#     )
+#     page_info = gr.Markdown(
+#         value="" # Info of PDF page
+#     )
+#     rendered_image = gr.Image(
+#         label="Processed Image (Thresholded for OCR)",
+#         interactive=False
+#     )
+#     num_pages = gr.Number(
+#         value=1, label="Current Page (slider)", visible=False
+#     )
+#     submit_btn = gr.Button("Extract Medicines", variant="primary")
+#     submit_btn.click(
+#         fn=process_input,
+#         inputs=[file_input, temperature, page_slider],
+#         outputs=[output_text, medicines_output, raw_output, page_info, rendered_image, num_pages]
+#     )
+#     file_input.change(
+#         fn=update_slider,
+#         inputs=[file_input],
+#         outputs=[page_slider]
+#     )
+# if __name__ == "__main__":
+#     demo.launch()
+##########################################   #############################################################
 # Create Gradio interface
 # with gr.Blocks(title="📖 Image/PDF OCR with LightOnOCR", theme=gr.themes.Soft()) as demo: