TorchTransformers-CV-SFT

Sleeping

App Files Files Community

awacke1 committed on Mar 30

Commit

99b2de2

verified ·

1 Parent(s): 2cbf123

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -20

app.py CHANGED Viewed

@@ -208,22 +208,31 @@ async def process_pdf_snapshot(pdf_path, mode="single"):
         status.error(f"Failed to process PDF: {str(e)}")
         return []
-async def process_ocr(image, output_file):
     # Run the GOT-OCR2_0 model on `image`, write the recognised text to
     # `output_file`, and return that text. Progress is reported through a
     # Streamlit placeholder created with st.empty().
     # `image` must provide .save(path) (presumably a PIL image - confirm
     # against callers).
     start_time = time.time()
     status = st.empty()
-    status.text("Processing GOT-OCR2_0... (0s)")
     # from_pretrained is called for both tokenizer and model on every
     # invocation; this function keeps no reference to them between calls.
-    tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
-    # Force CPU usage to avoid CUDA error until GPU setup is fixed
-    model = AutoModel.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True, torch_dtype=torch.float32).to("cpu").eval()
     # model.chat is handed a file path, so the image is first written to a
     # temporary PNG named after the current Unix second.
-    temp_file = f"temp_{int(time.time())}.png"
-    image.save(temp_file)
-    result = model.chat(tokenizer, temp_file, ocr_type='ocr')
     # NOTE(review): the removal is not inside try/finally, so the temp file
     # is left on disk if model.chat raises.
-    os.remove(temp_file)
-    elapsed = int(time.time() - start_time)
-    status.text(f"GOT-OCR2_0 completed in {elapsed}s!")
     # The file write is the only awaited operation in this coroutine.
-    async with aiofiles.open(output_file, "w") as f:
-        await f.write(result)
-    return result
 async def process_image_gen(prompt, output_file):
     start_time = time.time()
@@ -373,7 +382,7 @@ with tab_ocr:
     all_files = get_gallery_files()
     if all_files:
         if st.button("OCR All Assets 🚀"):
-            full_text = "# OCR Results\n\n"
             for file in all_files:
                 if file.endswith('.png'):
                     image = Image.open(file)
@@ -383,7 +392,7 @@ with tab_ocr:
                     image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                     doc.close()
                 output_file = generate_filename(f"ocr_{os.path.basename(file)}", "txt")
-                result = asyncio.run(process_ocr(image, output_file))
                 full_text += f"## {os.path.basename(file)}\n\n{result}\n\n"
                 entry = f"OCR Test: {file} -> {output_file}"
                 st.session_state['history'].append(entry)
@@ -405,7 +414,7 @@ with tab_ocr:
             if st.button("Run OCR 🚀", key="ocr_run"):
                 output_file = generate_filename("ocr_output", "txt")
                 st.session_state['processing']['ocr'] = True
-                result = asyncio.run(process_ocr(image, output_file))
                 entry = f"OCR Test: {selected_file} -> {output_file}"
                 st.session_state['history'].append(entry)
                 st.text_area("OCR Result", result, height=200, key="ocr_result")
@@ -418,7 +427,7 @@ with tab_ocr:
                     pix = doc[i].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
                     image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                     output_file = generate_filename(f"ocr_page_{i}", "txt")
-                    result = asyncio.run(process_ocr(image, output_file))
                     full_text += f"## Page {i + 1}\n\n{result}\n\n"
                     entry = f"OCR Test: {selected_file} Page {i + 1} -> {output_file}"
                     st.session_state['history'].append(entry)
@@ -454,7 +463,7 @@ with tab_build:
         entry = f"Built {model_type} model: {model_name}"
         st.session_state['history'].append(entry)
         st.success(f"Model downloaded and saved to {config.model_path}! 🎉")
-        st.experimental_rerun()
 with tab_imggen:
     st.header("Test Image Gen 🎨")
@@ -644,7 +653,7 @@ def update_gallery():
                     os.remove(file)
                     st.session_state['asset_checkboxes'].pop(file, None)
                     st.success(f"Asset {os.path.basename(file)} vaporized! 💨")
-                    st.experimental_rerun()
 update_gallery()

         status.error(f"Failed to process PDF: {str(e)}")
         return []
+async def process_gpt4o_ocr(image, output_file):
     # Send `image` to the "gpt-4o" chat-completions model with a text
     # extraction prompt, write the reply to `output_file`, and return it.
     # On any exception the error is shown in the Streamlit placeholder and
     # an empty string is returned instead of raising.
     # `image` must provide .save(fileobj, format=...) (presumably a PIL
     # image - confirm against callers). `client` is not defined in this
     # block; presumably a module-level OpenAI client - verify.
     start_time = time.time()
     status = st.empty()
+    status.text("Processing GPT-4o OCR... (0s)")
     # Encode the image in memory as PNG, then base64, so it can be embedded
     # in the request as a data: URL (no temporary file is written).
+    buffered = BytesIO()
+    image.save(buffered, format="PNG")
+    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
     # Single user message carrying two parts: the instruction text and the
     # image itself.
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Extract the electronic text from this image."},
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_str}", "detail": "auto"}}
+        ]
+    }]
+    try:
         # NOTE(review): the call is not awaited, so this presumably uses the
         # synchronous client and the coroutine does not yield while the
         # request is in flight.
         # NOTE(review): max_tokens=300 caps the reply length; text-dense
         # images will likely come back truncated - confirm this is intended.
+        response = client.chat.completions.create(model="gpt-4o", messages=messages, max_tokens=300)
+        result = response.choices[0].message.content
+        elapsed = int(time.time() - start_time)
+        status.text(f"GPT-4o OCR completed in {elapsed}s!")
         # The write sits inside the try, so a failure to write the file is
         # reported through the same except branch as an API failure.
+        async with aiofiles.open(output_file, "w") as f:
+            await f.write(result)
+        return result
+    except Exception as e:
+        status.error(f"Failed to process image with GPT-4o: {str(e)}")
+        return ""
 async def process_image_gen(prompt, output_file):
     start_time = time.time()
     all_files = get_gallery_files()
     if all_files:
         if st.button("OCR All Assets 🚀"):
+            full_text = "# OCR Results (GPT-4o)\n\n"
             for file in all_files:
                 if file.endswith('.png'):
                     image = Image.open(file)
                     image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                     doc.close()
                 output_file = generate_filename(f"ocr_{os.path.basename(file)}", "txt")
+                result = asyncio.run(process_gpt4o_ocr(image, output_file))
                 full_text += f"## {os.path.basename(file)}\n\n{result}\n\n"
                 entry = f"OCR Test: {file} -> {output_file}"
                 st.session_state['history'].append(entry)
             if st.button("Run OCR 🚀", key="ocr_run"):
                 output_file = generate_filename("ocr_output", "txt")
                 st.session_state['processing']['ocr'] = True
+                result = asyncio.run(process_gpt4o_ocr(image, output_file))
                 entry = f"OCR Test: {selected_file} -> {output_file}"
                 st.session_state['history'].append(entry)
                 st.text_area("OCR Result", result, height=200, key="ocr_result")
                     pix = doc[i].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
                     image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                     output_file = generate_filename(f"ocr_page_{i}", "txt")
+                    result = asyncio.run(process_gpt4o_ocr(image, output_file))
                     full_text += f"## Page {i + 1}\n\n{result}\n\n"
                     entry = f"OCR Test: {selected_file} Page {i + 1} -> {output_file}"
                     st.session_state['history'].append(entry)
         entry = f"Built {model_type} model: {model_name}"
         st.session_state['history'].append(entry)
         st.success(f"Model downloaded and saved to {config.model_path}! 🎉")
+        st.rerun()
 with tab_imggen:
     st.header("Test Image Gen 🎨")
                     os.remove(file)
                     st.session_state['asset_checkboxes'].pop(file, None)
                     st.success(f"Asset {os.path.basename(file)} vaporized! 💨")
+                    st.rerun()
 update_gallery()