Spaces:

MJobe
/

document-vqa-v2

Sleeping

App Files Files Community

MJobe committed on Nov 12, 2024

Commit

31d9e37

verified ·

1 Parent(s): fe58c62

Update main.py

Browse files

Files changed (1) hide show

main.py +89 -0

main.py CHANGED Viewed

@@ -39,6 +39,8 @@ nlp_classification = pipeline("text-classification", model="distilbert/distilber
 nlp_classification_v2 = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
 nlp_speech_to_text = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
 nlp_sequence_classification = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
 description = """
 ## Image-based Document QA
 This API performs document question answering using a LayoutLMv2-based model.
@@ -365,6 +367,93 @@ async def fast_classify_text(statement: str = Form(...)):
     except Exception as e:
         # Handle general errors
         return JSONResponse(content=f"Error in classification pipeline: {str(e)}", status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins

 nlp_classification_v2 = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
 nlp_speech_to_text = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
 nlp_sequence_classification = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+nlp_main_classification = pipeline("zero-shot-classification", model="roberta-large-mnli")  # Zero-shot pipeline created once at module level; classify_text calls it with the candidate `labels` list.
 description = """
 ## Image-based Document QA
 This API performs document question answering using a LayoutLMv2-based model.
     except Exception as e:
         # Handle general errors
         return JSONResponse(content=f"Error in classification pipeline: {str(e)}", status_code=500)
+# Candidate labels handed to the zero-shot pipeline for the main classification; "Notes not clear" is also returned directly by classify_text for blank or vague input.
+labels = [
+    "Change to quote",
+    "Copy quote requested",
+    "Expired Quote",
+    "Notes not clear"
+]
+# Sub-classification name -> trigger phrases; get_sub_classification lower-cases both sides and tests each phrase as a substring of the input text.
+keyword_map = {
+    "MRSP": ["MSRP", "MRSP copy quote", "msrp only"],  # NOTE(review): key is spelled "MRSP" while most phrases use "MSRP" - confirm which spelling API clients expect.
+    "Direct": ["Direct quote", "send directly"],
+    "All": ["All Pricing", "all pricing"],  # The two phrases are identical once lower-cased, so the second is redundant.
+    "MRSP & All": ["MSRP & All Pricing", "msrp only with all pricing"]  # NOTE(review): both phrases also contain "msrp" and "all pricing", so text matching this entry matches "MRSP" and "All" too.
+}
+# Return a truthy value when `text` is empty, whitespace-only, or consists solely of one filler word/phrase from the pattern below.
+def is_blank_or_vague(text):  # Result is True for blank text; otherwise whatever re.match returns (a truthy match object, or None).
+    # Matching ignores case and surrounding whitespace; extend the alternation to treat more filler phrases as vague. Assumes `re` is imported at the top of main.py (not visible in this diff) - confirm.
+    return not text.strip() or re.match(r'^\s*(please|send|quote|request|thank you|thanks)\s*$', text, re.IGNORECASE)
+# Return the keyword_map keys whose phrases occur in `text` (case-insensitive substring match), in keyword_map iteration order; ["Uncategorized"] when nothing matches.
+def get_sub_classification(text):
+    sub_labels = []
+    for sub_class, keywords in keyword_map.items():
+        if any(keyword.lower() in text.lower() for keyword in keywords):  # One matching phrase is enough; several sub-classes can match the same text.
+            sub_labels.append(sub_class)
+    return sub_labels if sub_labels else ["Uncategorized"]  # Fallback keeps the result non-empty.
+@app.post("/classify_text/")  # POST endpoint returning a zero-shot main label plus keyword-based sub-labels for `statement`.
+async def classify_text(statement: str = Form(...)):  # `statement` is supplied via Form(...) (presumably FastAPI's form-field helper; the import is not visible in this diff).
+    try:
+        # Short-circuit: blank or filler-only text is reported as "Notes not clear" with fixed scores of 1.0, without running the model
+        if is_blank_or_vague(statement):
+            return {
+                "main_classification": {
+                    "label": "Notes not clear",
+                    "confidence": 1.0,
+                    "scores": {"Notes not clear": 1.0}
+                },
+                "sub_classification": {
+                    "labels": ["Uncategorized"],
+                    "scores": {"Uncategorized": 1.0}
+                }
+            }
+        # Run the synchronous pipeline call through run_in_executor (executor argument None selects the loop's default executor) so it can be awaited
+        loop = asyncio.get_running_loop()
+        main_classification_task = loop.run_in_executor(
+            None,
+            lambda: nlp_main_classification(statement, labels)
+        )
+        # Wait for the executor future; no timeout is applied here
+        main_class_result = await main_classification_task
+        # Build a label -> score map and take the first label/score as the best; assumes the pipeline returns them sorted by descending score - TODO confirm
+        main_class_scores = {label: score for label, score in zip(main_class_result["labels"], main_class_result["scores"])}
+        best_main_classification = main_class_result["labels"][0]
+        best_main_score = main_class_result["scores"][0]
+        # Sub-classifications come from keyword matching only (see keyword_map), not from the model
+        sub_classification = get_sub_classification(statement)
+        # Keyword matches carry no model score, so every matched sub-label is reported with a fixed score of 1.0
+        sub_class_scores = {sub: 1.0 for sub in sub_classification}
+        # Response has the same shape as the blank/vague short-circuit above
+        return {
+            "main_classification": {
+                "label": best_main_classification,
+                "confidence": best_main_score,
+                "scores": main_class_scores
+            },
+            "sub_classification": {
+                "labels": sub_classification,
+                "scores": sub_class_scores
+            }
+        }
+    except asyncio.TimeoutError:  # NOTE(review): nothing above sets a timeout (e.g. asyncio.wait_for), so this handler looks unreachable unless the pipeline itself raises it - confirm.
+        return JSONResponse(content="Classification timed out.", status_code=504)
+    except HTTPException as http_exc:  # NOTE(review): no statement visible in the try body raises HTTPException - confirm whether this handler is needed.
+        return JSONResponse(content=f"HTTP error: {http_exc.detail}", status_code=http_exc.status_code)
+    except Exception as e:  # Catch-all: the exception text is returned to the client with status 500.
+        return JSONResponse(content=f"Error in classification pipeline: {str(e)}", status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins