import gradio as gr
from transformers import pipeline
import spacy
import language_tool_python
import json
import requests

# Initialize models and tools
nlp = spacy.load("en_core_web_sm")
language_tool = language_tool_python.LanguageTool('en-US')
spell_checker = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base")
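# Note: "en_core_web_sm" must be installed separately
# (python -m spacy download en_core_web_sm), and language_tool_python
# downloads and runs a local LanguageTool server, which requires Java.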
def preprocess_and_forward(text: str) -> str:
    """Preprocess the input text, then forward it to the context-detection Space."""
    processed_text, preprocessing_results = preprocess_text(text)
    try:
        # Forward preprocessed text to context detection (space_9)
        context_response = requests.post(
            "https://api.gradio.app/v2/Frenchizer/space_9/predict",
            json={"data": [processed_text]}
        ).json()
        if "error" in context_response:
            return json.dumps({
                "error": "Context detection failed",
                "preprocessing_results": preprocessing_results
            })
        context = context_response["data"][0]
        # Return preprocessing results and detected context
        result = {
            "preprocessing": preprocessing_results,
            "context": context
        }
        return json.dumps(result)
    except Exception as e:
        return json.dumps({
            "error": str(e),
            "preprocessing_results": preprocessing_results
        })
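# Alternative (sketch): current Gradio Spaces are usually called through the
# gradio_client package rather than the raw v2 REST endpoint above. Assuming
# "Frenchizer/space_9" exposes a default /predict endpoint, the request could
# be written as:
#
#   from gradio_client import Client
#   client = Client("Frenchizer/space_9")
#   context = client.predict(processed_text, api_name="/predict")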
def preprocess_text(text: str):
    """Run grammar, spelling, NER, and tag extraction; return (text, results)."""
    result = {
        "corrections": [],
        "entities": [],
        "tags": [],
        "spell_suggestions": []
    }
    # Grammar and spell checking with LanguageTool
    matches = language_tool.check(text)
    for match in matches:
        if match.replacements:
            # Slice the flagged span out of the match context
            result["corrections"].append({
                "original": match.context[match.offsetInContext:match.offsetInContext + match.errorLength],
                "suggestion": match.replacements[0]
            })
    # Transformer-based spell check
    spell_checked = spell_checker(text, max_length=512)[0]['generated_text']
    if spell_checked != text:
        result["spell_suggestions"].append({
            "original": text,
            "corrected": spell_checked
        })
    # Named-entity recognition with spaCy
    doc = nlp(text)
    result["entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
    # Extract potential tags (hashtags and @-mentions)
    result["tags"] = [token.text for token in doc if token.text.startswith(('#', '@'))]
    # Return the original text unchanged: corrections are reported, not applied
    return text, result
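# Illustrative call (exact suggestions depend on the loaded models):
#   preprocess_text("I beleive the meeting is in Paris")
# would typically report a "beleive" -> "believe" correction and a
# Paris/GPE entity, while returning the original text untouched.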
# Gradio interface
with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input Text")
    output_json = gr.JSON(label="Processing Results")
    preprocess_button = gr.Button("Process")
    preprocess_button.click(fn=preprocess_and_forward, inputs=[input_text], outputs=[output_json])

if __name__ == "__main__":
    demo.launch()
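# Example client call (sketch; the Space name below is a placeholder, and the
# v2 endpoint mirrors the one this app itself uses for context detection):
#
#   import requests, json
#   response = requests.post(
#       "https://api.gradio.app/v2/<user>/<space>/predict",
#       json={"data": ["Helo wrld from New York"]},
#   ).json()
#   print(json.loads(response["data"][0]))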