import gradio as gr
from transformers import pipeline
import spacy
import language_tool_python
import json
import requests
# Initialize models and tools
nlp = spacy.load("en_core_web_sm")
language_tool = language_tool_python.LanguageTool('en-US')
spell_checker = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base")
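# Note: these dependencies have setup steps beyond pip installs: the spaCy
# model must be downloaded separately (python -m spacy download en_core_web_sm),
# language_tool_python needs a Java runtime and fetches the LanguageTool server
# on first use, and the spelling-correction model is pulled from the Hugging
# Face Hub on the first run.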
def preprocess_and_forward(text: str) -> str:
    processed_text, preprocessing_results = preprocess_text(text)
    try:
        # Forward the preprocessed text to the context-detection Space (space_9)
        context_response = requests.post(
            "https://api.gradio.app/v2/Frenchizer/space_9/predict",
            json={"data": [processed_text]},
            timeout=30,
        ).json()
        if "error" in context_response:
            return json.dumps({
                "error": "Context detection failed",
                "preprocessing_results": preprocessing_results
            })
        context = context_response["data"][0]
        # Return the preprocessing results together with the detected context
        result = {
            "preprocessing": preprocessing_results,
            "context": context
        }
        return json.dumps(result)
    except Exception as e:
        return json.dumps({
            "error": str(e),
            "preprocessing_results": preprocessing_results
        })
def preprocess_text(text: str):
    result = {
        "corrections": [],
        "entities": [],
        "tags": [],
        "spell_suggestions": []
    }
    # Grammar and spell checking with LanguageTool
    matches = language_tool.check(text)
    for match in matches:
        if match.replacements:
            result["corrections"].append({
                "original": match.context[match.offsetInContext:match.offsetInContext + match.errorLength],
                "suggestion": match.replacements[0]
            })
    # Transformer-based spelling correction
    spell_checked = spell_checker(text, max_length=512)[0]['generated_text']
    if spell_checked != text:
        result["spell_suggestions"].append({
            "original": text,
            "corrected": spell_checked
        })
    # Named-entity recognition with spaCy
    doc = nlp(text)
    result["entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
    # Collect hashtag- and mention-style tokens as tags
    result["tags"] = [token.text for token in doc if token.text.startswith(('#', '@'))]
    return text, result
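# Illustrative call (shape only; the actual corrections, entities, and
# suggestions depend on the models):
#   text, results = preprocess_text("Thsi is a smaple sentence #demo")
#   results -> {"corrections": [...], "entities": [...],
#               "tags": [...], "spell_suggestions": [...]}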
# Gradio interface
with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input Text")
    output_json = gr.JSON(label="Processing Results")
    preprocess_button = gr.Button("Process")
    preprocess_button.click(fn=preprocess_and_forward, inputs=[input_text], outputs=[output_json])

if __name__ == "__main__":
    demo.launch()
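# Minimal client-side sketch for exercising this Space once deployed
# ("Frenchizer/space_X" is a placeholder repo id; substitute the real one):
#   from gradio_client import Client
#   client = Client("Frenchizer/space_X")
#   print(client.predict("Hello wrold", fn_index=0))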