asdfasdfdsafdsa committed on
Commit
508f678
·
verified ·
1 Parent(s): 1601325

Upload 2 files

Browse files
Files changed (2) hide show
  1. api.py +154 -14
  2. api_client.py +310 -0
api.py CHANGED
@@ -12,11 +12,18 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForToken
12
  import time
13
  import re
14
  import logging
 
15
  from contextlib import asynccontextmanager
16
 
 
 
 
 
 
17
  # Configure logging
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
 
20
 
21
  # Global variables for models
22
  gec_model = None
@@ -32,7 +39,7 @@ GEC_CONFIG = {
32
  "length_penalty": 1.0,
33
  "no_repeat_ngram_size": 0,
34
  "early_stopping": True,
35
- "max_new_tokens": 1500
36
  }
37
 
38
  @asynccontextmanager
@@ -91,7 +98,7 @@ app.add_middleware(
91
 
92
  # Request/Response models
93
  class CorrectionRequest(BaseModel):
94
- text: str = Field(..., max_length=5000, description="Czech text to correct")
95
  options: Optional[Dict] = Field(default={}, description="Optional parameters")
96
 
97
  class CorrectionResponse(BaseModel):
@@ -132,7 +139,7 @@ def apply_gec_correction(text: str) -> str:
132
  inputs = gec_tokenizer(
133
  text,
134
  return_tensors="pt",
135
- max_length=1024,
136
  truncation=True
137
  )
138
  inputs = {k: v.to(device) for k, v in inputs.items()}
@@ -148,6 +155,52 @@ def apply_gec_correction(text: str) -> str:
148
  corrected = gec_tokenizer.decode(outputs[0], skip_special_tokens=True)
149
  return corrected
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  def apply_punctuation(text: str) -> str:
152
  """Apply punctuation and capitalization to text"""
153
  if not text.strip():
@@ -213,6 +266,79 @@ def apply_punctuation(text: str) -> str:
213
 
214
  return capitalized
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  def process_text(text: str) -> str:
217
  """Full pipeline: GEC + punctuation"""
218
  # Step 1: Grammar correction
@@ -235,8 +361,8 @@ async def correct_text(request: CorrectionRequest):
235
  if not request.text.strip():
236
  raise HTTPException(status_code=400, detail="Text cannot be empty")
237
 
238
- if len(request.text) > 5000:
239
- raise HTTPException(status_code=400, detail="Text too long (max 5000 characters)")
240
 
241
  # Process text
242
  corrected = process_text(request.text)
@@ -266,7 +392,7 @@ async def correct_text(request: CorrectionRequest):
266
  @app.post("/api/correct/batch", response_model=BatchCorrectionResponse)
267
  async def correct_batch(request: BatchCorrectionRequest):
268
  """
269
- Correct multiple Czech texts
270
  """
271
  try:
272
  start_time = time.time()
@@ -275,14 +401,28 @@ async def correct_batch(request: BatchCorrectionRequest):
275
  if not request.texts:
276
  raise HTTPException(status_code=400, detail="No texts provided")
277
 
278
- # Process each text
279
- corrected_texts = []
280
  for text in request.texts:
281
- if len(text) > 5000:
282
- corrected_texts.append(f"[Error: Text too long]")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  else:
284
- corrected = process_text(text)
285
- corrected_texts.append(corrected)
286
 
287
  # Calculate processing time
288
  processing_time = (time.time() - start_time) * 1000
@@ -374,7 +514,7 @@ async def get_info():
374
  "Batch processing",
375
  "Czech language focus"
376
  ],
377
- max_input_length=5000
378
  )
379
 
380
  @app.get("/")
@@ -390,5 +530,5 @@ async def root():
390
  if __name__ == "__main__":
391
  import uvicorn
392
  import os
393
- port = int(os.environ.get("PORT", 7860))
394
  uvicorn.run(app, host="0.0.0.0", port=port)
 
12
  import time
13
  import re
14
  import logging
15
+ import os
16
  from contextlib import asynccontextmanager
17
 
18
+ # Configure CPU threads for model inference (default 12 threads for better performance)
19
+ num_threads = int(os.environ.get("OMP_NUM_THREADS", 12))
20
+ torch.set_num_threads(num_threads)
21
+ torch.set_num_interop_threads(num_threads)
22
+
23
  # Configure logging
24
  logging.basicConfig(level=logging.INFO)
25
  logger = logging.getLogger(__name__)
26
+ logger.info(f"PyTorch configured to use {num_threads} CPU threads")
27
 
28
  # Global variables for models
29
  gec_model = None
 
39
  "length_penalty": 1.0,
40
  "no_repeat_ngram_size": 0,
41
  "early_stopping": True,
42
+ "max_new_tokens": 100000
43
  }
44
 
45
  @asynccontextmanager
 
98
 
99
  # Request/Response models
100
  class CorrectionRequest(BaseModel):
101
+ text: str = Field(..., max_length=100000, description="Czech text to correct")
102
  options: Optional[Dict] = Field(default={}, description="Optional parameters")
103
 
104
  class CorrectionResponse(BaseModel):
 
139
  inputs = gec_tokenizer(
140
  text,
141
  return_tensors="pt",
142
+ max_length=100000,
143
  truncation=True
144
  )
145
  inputs = {k: v.to(device) for k, v in inputs.items()}
 
155
  corrected = gec_tokenizer.decode(outputs[0], skip_special_tokens=True)
156
  return corrected
157
 
158
def apply_gec_correction_batch(texts: List[str]) -> List[str]:
    """Apply grammar error correction to multiple texts (batched for GPU efficiency).

    Blank / whitespace-only entries are passed through unchanged; all other
    texts are corrected in a single batched generate call. The returned list
    has the same length and order as `texts`.
    """
    if not texts:
        return []

    # Start from a copy of the inputs so blank entries keep their original
    # value, then record which positions actually need correction.
    results = list(texts)
    pending = [(pos, txt) for pos, txt in enumerate(texts) if txt.strip()]

    if not pending:
        return results

    batch = [txt for _, txt in pending]

    # Encode the whole batch in one call; padding makes the tensor rectangular.
    encoded = gec_tokenizer(
        batch,
        return_tensors="pt",
        max_length=100000,
        truncation=True,
        padding=True
    )
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

    # Single batched generation pass — inference only, so no gradients.
    with torch.no_grad():
        generated = gec_model.generate(
            **encoded,
            **GEC_CONFIG
        )

    decoded = gec_tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Scatter corrected strings back to their original slots.
    for (pos, _), corrected in zip(pending, decoded):
        results[pos] = corrected

    return results
204
  def apply_punctuation(text: str) -> str:
205
  """Apply punctuation and capitalization to text"""
206
  if not text.strip():
 
266
 
267
  return capitalized
268
 
269
def apply_punctuation_batch(texts: List[str]) -> List[str]:
    """Apply punctuation and capitalization to multiple texts (batched for GPU efficiency).

    Args:
        texts: Texts to punctuate; blank/whitespace-only entries are passed
            through unchanged.

    Returns:
        List of punctuated and sentence-capitalized texts, same length and
        order as `texts`.
    """
    if not texts:
        return []

    # Token-classification label -> punctuation mark appended after the word.
    # Hoisted out of the per-token loop: the map is invariant, so rebuilding
    # it on every token was pure overhead.
    punct_marks = {
        'LABEL_0': '',
        'LABEL_1': '.',
        'LABEL_2': ',',
        'LABEL_3': '?',
        'LABEL_4': '-',
        'LABEL_5': ':'
    }

    results = []
    for text in texts:
        if not text.strip():
            results.append(text)
            continue

        # Pipeline input is lowercased; capitalization is restored afterwards.
        clean_text = text.lower()
        pipeline_results = punct_pipeline(clean_text)

        # Merge subword tokens ('▁' marks a word start) into whole words,
        # keeping the punctuation predicted at each word's first subword.
        # NOTE(review): punct_map is keyed by word text, so repeated words
        # share a single entry (last occurrence wins) — confirm acceptable.
        punct_map = {}
        current_word = ""
        current_punct = ""

        for i, result in enumerate(pipeline_results):
            word = result['word'].replace('▁', '').strip()
            punct = punct_marks.get(result['entity'], '')

            if not result['word'].startswith('▁') and i > 0:
                # Continuation subword: extend the word, keep its first punct.
                current_word += word
            else:
                if current_word:
                    punct_map[current_word] = current_punct
                current_word = word
                current_punct = punct

        # Flush the final accumulated word.
        if current_word:
            punct_map[current_word] = current_punct

        # Re-emit the lowercased words with their predicted punctuation.
        words = clean_text.split()
        punctuated = []
        for word in words:
            if word in punct_map and punct_map[word]:
                punctuated.append(word + punct_map[word])
            else:
                punctuated.append(word)

        result_text = ' '.join(punctuated)

        # Capitalize the first letter of each sentence.
        sentences = re.split(r'(?<=[.?!])\s+', result_text)
        capitalized = ' '.join(s[0].upper() + s[1:] if s else s for s in sentences)

        # Remove any stray space left before punctuation marks.
        for p in [',', '.', '?', ':', '!', ';']:
            capitalized = capitalized.replace(f' {p}', p)

        results.append(capitalized)

    return results
342
  def process_text(text: str) -> str:
343
  """Full pipeline: GEC + punctuation"""
344
  # Step 1: Grammar correction
 
361
  if not request.text.strip():
362
  raise HTTPException(status_code=400, detail="Text cannot be empty")
363
 
364
+ if len(request.text) > 100000:
365
+ raise HTTPException(status_code=400, detail="Text too long (max 100000 characters)")
366
 
367
  # Process text
368
  corrected = process_text(request.text)
 
392
  @app.post("/api/correct/batch", response_model=BatchCorrectionResponse)
393
  async def correct_batch(request: BatchCorrectionRequest):
394
  """
395
+ Correct multiple Czech texts (batched for GPU efficiency)
396
  """
397
  try:
398
  start_time = time.time()
 
401
  if not request.texts:
402
  raise HTTPException(status_code=400, detail="No texts provided")
403
 
404
+ # Validate text lengths
405
+ validated_texts = []
406
  for text in request.texts:
407
+ if len(text) > 100000:
408
+ validated_texts.append("") # Will be handled as error later
409
+ else:
410
+ validated_texts.append(text)
411
+
412
+ # Process all texts in batch (GPU efficient!)
413
+ # Step 1: Grammar correction (batched)
414
+ gec_corrected_texts = apply_gec_correction_batch(validated_texts)
415
+
416
+ # Step 2: Punctuation and capitalization (batched)
417
+ final_texts = apply_punctuation_batch(gec_corrected_texts)
418
+
419
+ # Mark texts that were too long
420
+ corrected_texts = []
421
+ for i, text in enumerate(request.texts):
422
+ if len(text) > 100000:
423
+ corrected_texts.append("[Error: Text too long]")
424
  else:
425
+ corrected_texts.append(final_texts[i])
 
426
 
427
  # Calculate processing time
428
  processing_time = (time.time() - start_time) * 1000
 
514
  "Batch processing",
515
  "Czech language focus"
516
  ],
517
+ max_input_length=100000
518
  )
519
 
520
  @app.get("/")
 
530
  if __name__ == "__main__":
531
  import uvicorn
532
  import os
533
+ port = int(os.environ.get("PORT", 8042))
534
  uvicorn.run(app, host="0.0.0.0", port=port)
api_client.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Client for Czech text correction API with local server auto-start
3
+ """
4
+
5
+ import requests
6
+ import time
7
+ import subprocess
8
+ import os
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Optional, Dict, List, Any
12
+ import logging
13
+
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
class CzechCorrectionClient:
    """Client for Czech text correction with automatic local server startup.

    Talks exclusively to a local API instance on port 8042; if no healthy
    server is found, it launches `api.py` from this directory as a background
    subprocess and waits for it to come up.
    """

    # Local API endpoint only — name, base URL, and per-request timeout.
    LOCAL_ENDPOINT = {
        "name": "Local",
        "base_url": "http://localhost:8042",
        "timeout": 3600  # 1 hour for local (grammar correction can be slow)
    }

    def __init__(self, prefer_local: bool = True) -> None:
        """
        Initialize the client.

        Args:
            prefer_local: Deprecated, always uses local API now.
        """
        self.endpoint = self.LOCAL_ENDPOINT
        # Endpoint last confirmed healthy (None until first successful check).
        self._working_endpoint = None
        # Timestamp of the last successful health check (epoch seconds).
        self._last_health_check = 0
        self.health_check_interval = 3600  # Cache endpoint for 1 hour
        # Popen handle of a server we started ourselves; never terminated here.
        self._server_process = None

    def _check_endpoint_health(self, endpoint: Dict) -> bool:
        """Return True iff GET /api/health answers 200 with status == 'healthy'."""
        try:
            response = requests.get(
                f"{endpoint['base_url']}/api/health",
                timeout=10  # Increased timeout for health check
            )
            if response.status_code == 200:
                data = response.json()
                return data.get('status') == 'healthy'
        except Exception as e:
            # Network/timeout errors are expected when the server is down.
            logger.debug(f"Health check failed for {endpoint['name']}: {e}")
        return False

    def _is_port_in_use(self, port: int) -> bool:
        """Check if a port is already in use by attempting to bind it locally."""
        import socket
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(('localhost', port))
                return False
            except OSError:
                # Bind failed -> something else already owns the port.
                return True

    def _start_local_server(self) -> bool:
        """Start the local API server if not already running.

        Returns True when a healthy server is reachable (pre-existing or
        freshly started), False otherwise.
        """
        try:
            # Check if port 8042 is already in use
            if self._is_port_in_use(8042):
                logger.warning("Port 8042 is already in use - server may already be running")
                # Wait a bit and check health again
                time.sleep(2)
                if self._check_endpoint_health(self.endpoint):
                    logger.info("✅ Server is already running on port 8042")
                    return True
                else:
                    logger.error("Port 8042 is in use but server is not responding to health checks")
                    return False

            # Find the api_service directory (api.py is expected next to this file)
            current_file = Path(__file__).resolve()
            api_service_dir = current_file.parent
            api_script = api_service_dir / "api.py"

            if not api_script.exists():
                logger.error(f"API script not found at {api_script}")
                return False

            logger.info("Starting local API server...")
            logger.info("This may take 1-2 minutes to load models...")

            # Start the server in the background, detached from our session.
            # NOTE(review): stdout/stderr are PIPEd but never drained — a very
            # chatty server could fill the pipe buffer and block; confirm the
            # server logs little, or redirect to DEVNULL/a file.
            env = os.environ.copy()
            env['PORT'] = '8042'  # Set port to 8042

            self._server_process = subprocess.Popen(
                [sys.executable, str(api_script)],
                cwd=str(api_service_dir),
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                start_new_session=True
            )

            # Wait for server to be ready (up to 2 minutes), polling health.
            max_wait = 120
            start_time = time.time()

            while time.time() - start_time < max_wait:
                if self._check_endpoint_health(self.endpoint):
                    logger.info("✅ Local API server started successfully")
                    return True
                time.sleep(2)

            logger.error("Server failed to start within timeout")
            return False

        except Exception as e:
            logger.error(f"Failed to start local server: {e}")
            return False

    def _get_working_endpoint(self) -> Optional[Dict]:
        """Get working endpoint, starting server if needed.

        A successful health check is cached for `health_check_interval`
        seconds to avoid hitting /api/health on every request.
        """
        current_time = time.time()

        # Use cached endpoint if still valid
        if self._working_endpoint and (current_time - self._last_health_check < self.health_check_interval):
            return self._working_endpoint

        # Check if local server is running
        if self._check_endpoint_health(self.endpoint):
            logger.info(f"Using {self.endpoint['name']} API endpoint")
            self._working_endpoint = self.endpoint
            self._last_health_check = current_time
            return self.endpoint

        # Try to start the server ourselves as a last resort.
        logger.info("Local API server not running, attempting to start...")
        if self._start_local_server():
            self._working_endpoint = self.endpoint
            self._last_health_check = current_time
            return self.endpoint

        logger.error("Could not start or connect to local API server")
        return None

    def correct_text(self, text: str, include_timing: bool = False) -> Dict[str, Any]:
        """
        Correct Czech text (grammar and punctuation).

        Args:
            text: Text to correct.
            include_timing: Whether to include processing time in response.

        Returns:
            Dict with 'success', 'corrected_text', and optionally
            'processing_time_ms'. On any failure 'success' is False and
            'corrected_text' echoes the input unchanged.
        """
        if not text or not text.strip():
            # Nothing to correct — report success with the input as-is.
            return {
                "success": True,
                "corrected_text": text,
                "error": None
            }

        endpoint = self._get_working_endpoint()
        if not endpoint:
            return {
                "success": False,
                "corrected_text": text,
                "error": "Could not start or connect to local API server"
            }

        try:
            payload = {
                "text": text,
                "options": {"include_timing": include_timing}
            }

            response = requests.post(
                f"{endpoint['base_url']}/api/correct",
                json=payload,
                timeout=endpoint['timeout']
            )

            if response.status_code == 200:
                return response.json()
            else:
                return {
                    "success": False,
                    "corrected_text": text,
                    "error": f"API error: {response.status_code}"
                }

        except requests.exceptions.Timeout:
            logger.warning(f"Timeout on {endpoint['name']} API")
            return {
                "success": False,
                "corrected_text": text,
                "error": "Request timeout"
            }

        except Exception as e:
            logger.error(f"Error calling API: {e}")
            return {
                "success": False,
                "corrected_text": text,
                "error": str(e)
            }

    def correct_batch(self, texts: List[str], include_timing: bool = False) -> Dict[str, Any]:
        """
        Correct multiple Czech texts in batch.

        Args:
            texts: List of texts to correct (max 10).
            include_timing: Whether to include processing time.

        Returns:
            Dict with 'success', 'corrected_texts', and optionally
            'processing_time_ms'. If the batch endpoint fails, falls back to
            per-text correction and reports success (failed items keep their
            original text).
        """
        if not texts:
            return {
                "success": True,
                "corrected_texts": [],
                "error": None
            }

        if len(texts) > 10:
            # Client-side guard mirroring the server's batch size limit.
            return {
                "success": False,
                "corrected_texts": texts,
                "error": "Batch size exceeds limit (10)"
            }

        endpoint = self._get_working_endpoint()
        if not endpoint:
            return {
                "success": False,
                "corrected_texts": texts,
                "error": "Could not start or connect to local API server"
            }

        try:
            payload = {
                "texts": texts,
                "options": {"include_timing": include_timing}
            }

            response = requests.post(
                f"{endpoint['base_url']}/api/correct/batch",
                json=payload,
                timeout=endpoint['timeout'] * 2  # Longer timeout for batch
            )

            if response.status_code == 200:
                return response.json()
            else:
                # Fallback to individual corrections
                logger.warning(f"Batch API failed, falling back to individual corrections")
                corrected_texts = []
                for text in texts:
                    result = self.correct_text(text, include_timing=False)
                    corrected_texts.append(result.get('corrected_text', text))

                return {
                    "success": True,
                    "corrected_texts": corrected_texts,
                    "error": None
                }

        except Exception as e:
            logger.error(f"Error calling batch API: {e}")
            # Fallback to individual corrections
            corrected_texts = []
            for text in texts:
                result = self.correct_text(text, include_timing=False)
                corrected_texts.append(result.get('corrected_text', text))

            return {
                "success": True,
                "corrected_texts": corrected_texts,
                "error": None
            }
284
+
285
+
286
+ # Convenience functions for backward compatibility
287
+ _default_client = None
288
+
289
def get_client(prefer_local: bool = True) -> CzechCorrectionClient:
    """Return the shared module-level client, creating it on first use.

    The `prefer_local` flag is retained for backward compatibility only;
    the client always talks to the local API.
    """
    global _default_client
    client = _default_client
    if client is None:
        client = CzechCorrectionClient(prefer_local=True)
        _default_client = client
    return client
295
+
296
def correct_text(text: str, prefer_local: bool = True) -> str:
    """Convenience wrapper: correct a single text via the shared client.

    Returns the input unchanged when the API call fails. The `prefer_local`
    flag is kept for backward compatibility only.
    """
    outcome = get_client(prefer_local=True).correct_text(text)
    return outcome['corrected_text'] if outcome['success'] else text
303
+
304
def correct_batch(texts: List[str], prefer_local: bool = True) -> List[str]:
    """Convenience wrapper: correct several texts via the shared client.

    Returns the inputs unchanged when the API call fails. The `prefer_local`
    flag is kept for backward compatibility only.
    """
    outcome = get_client(prefer_local=True).correct_batch(texts)
    if not outcome['success']:
        return texts
    return outcome.get('corrected_texts', texts)