# Source of a Hugging Face Space, captured from the file viewer page.
# Space status at capture time: Runtime error.
# File size: 4,044 bytes (127 lines in the viewer's line-number gutter).
# Revision hashes listed by the viewer: 750020e 11b325d 750020e 80a2f6a 750020e
import html
import json
import os
import re
import shutil

import gradio as gr
import pandas as pd
import requests
import torch
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hub identifier shared by the classification pipeline and the tokenizer.
editorial_model = "PleIAs/Estienne"

# Token-classification pipeline used by the chatbot's predict method.
token_classifier = pipeline(
    task="token-classification",
    model=editorial_model,
    device=device,
    aggregation_strategy="simple",
)

# Stand-alone tokenizer, used only to count tokens before calling the pipeline.
tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
def _split_oversized(segment, max_tokens):
    """Cut one over-long segment into pieces that each fit ``max_tokens``.

    The segment is halved repeatedly. Each cut is placed on the first
    whitespace character at or after the midpoint, then on the closest one
    before it, and only as a last resort in the middle of a word. Both halves
    are re-checked, so no returned piece exceeds the budget unless it is a
    single character that cannot be divided further.

    Args:
        segment: Text whose token count exceeds ``max_tokens``.
        max_tokens: Maximum number of tokens allowed per piece.

    Returns:
        The non-empty, stripped pieces in their original order.
    """
    pieces = []
    # Explicit stack instead of recursion; the left half is pushed last so it
    # is popped first and the output keeps the reading order.
    pending = [segment.strip()]
    while pending:
        piece = pending.pop()
        if not piece:
            continue
        if len(piece) < 2 or len(tokenizer.tokenize(piece)) <= max_tokens:
            pieces.append(piece)
            continue

        midpoint = len(piece) // 2
        cut = next(
            (i for i in range(midpoint, len(piece)) if piece[i].isspace()),
            None,
        )
        if cut is None:
            # No whitespace in the second half: look backwards, and fall back
            # to a hard cut at the midpoint so the loop always makes progress.
            cut = next(
                (i for i in range(midpoint - 1, 0, -1) if piece[i].isspace()),
                midpoint,
            )

        # ``piece`` is stripped and 1 <= cut <= len(piece) - 1, so both halves
        # are non-empty and strictly shorter than ``piece``.
        pending.append(piece[cut:].strip())
        pending.append(piece[:cut].strip())
    return pieces


def split_text(text, max_tokens=500):
    """Split ``text`` into chunks of at most ``max_tokens`` tokens.

    Lines are first packed greedily: consecutive lines are joined with a
    newline for as long as the joined text still fits the budget. Any chunk
    that is still too long afterwards (a single line longer than the budget)
    is then cut on whitespace by ``_split_oversized``.

    Token counts come from the module-level ``tokenizer``.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens allowed per chunk.

    Returns:
        A list of chunks in reading order; empty when ``text`` is empty.
    """
    chunks = []
    current_chunk = ""
    for part in text.split("\n"):
        candidate = f"{current_chunk}\n{part}" if current_chunk else part
        if len(tokenizer.tokenize(candidate)) <= max_tokens:
            current_chunk = candidate
            continue
        # The candidate overflows: flush what was accumulated so far and
        # start a new chunk with the current line.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = part
    if current_chunk:
        chunks.append(current_chunk)

    # A chunk can only exceed the budget when it is a single over-long line;
    # check every chunk, not just the case where the text had no newline.
    bounded = []
    for chunk in chunks:
        if len(tokenizer.tokenize(chunk)) > max_tokens:
            bounded.extend(_split_oversized(chunk, max_tokens))
        else:
            bounded.append(chunk)
    return bounded
# Class wrapping the token-classification pipeline behind a predict() method
class MistralChatBot:
    """Expose the editorial token classifier through a single ``predict`` call.

    Despite its name, the class does not generate text: ``predict`` sends the
    user's text to the module-level ``token_classifier`` pipeline and renders
    the labelled spans as HTML.
    """

    def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
        # Stored for interface compatibility; nothing in this class reads it.
        self.system_prompt = system_prompt

    @staticmethod
    def _render_entities(chunk, entities):
        """Render the entities found in one chunk as HTML paragraphs.

        Assumes each entity is a dict shaped like the aggregated output of a
        transformers token-classification pipeline (``entity_group``,
        ``word`` and optionally ``start``/``end``) - TODO confirm against the
        installed transformers version.

        Args:
            chunk: The text that was classified.
            entities: The entity dicts returned for ``chunk``.

        Returns:
            One ``<p>`` element per entity, joined with newlines; an empty
            string when there are no entities.
        """
        rendered = []
        for entity in entities:
            start, end = entity.get("start"), entity.get("end")
            if start is not None and end is not None:
                # Prefer the original characters over the detokenized word.
                span_text = chunk[start:end]
            else:
                span_text = entity.get("word", "")
            # Both values end up inside markup, so escape them.
            label = html.escape(str(entity.get("entity_group", "")))
            body = html.escape(str(span_text))
            rendered.append(
                f'<p class="editorial-span" title="{label}">'
                f"<strong>{label}</strong> {body}</p>"
            )
        return "\n".join(rendered)

    def predict(self, user_message):
        """Classify ``user_message`` and return the result as an HTML string.

        Args:
            user_message: Raw text typed by the user; ``None`` or blank text
                yields an empty result block.

        Returns:
            An HTML fragment: a heading followed by a ``div.generation``
            containing one paragraph per identified span.
        """
        # Newlines are replaced by an explicit pilcrow marker.
        editorial_text = re.sub("\n", " ¶ ", user_message or "")

        if not editorial_text.strip():
            batch_prompts = []
        elif len(tokenizer.tokenize(editorial_text)) > 500:
            # Too long for a single pass: classify it chunk by chunk.
            batch_prompts = [
                chunk
                for chunk in split_text(editorial_text, max_tokens=500)
                if chunk
            ]
        else:
            batch_prompts = [editorial_text]

        results = token_classifier(batch_prompts) if batch_prompts else []
        # Normalize a flat list of entities (single input) to one list per chunk.
        if results and isinstance(results[0], dict):
            results = [results]

        html_output = "\n".join(
            self._render_entities(chunk, entities)
            for chunk, entities in zip(batch_prompts, results)
        )
        generated_text = (
            '<h2 style="text-align:center">Réponse</h2>\n<div class="generation">'
            + html_output
            + "</div>"
        )
        return generated_text
# Module-level instance; its predict method is the callback wired to the button below
mistral_bot = MistralChatBot()
# Gradio interface metadata
title = "Éditorialisation"
description = (
    "Un outil expérimental d'identification de la structure du texte à partir d'un encoder (Deberta)"
)

# One example row: [user_message, temperature]
examples = [["Qui peut bénéficier de l'AIP?", 0.7]]

# NOTE(review): the examples and the temperature slider are not passed to any
# component in the visible layout - confirm whether they are still needed.
additional_inputs = [
    gr.Slider(
        minimum=0.05,
        maximum=1.0,
        step=0.05,
        value=0.2,  # Default value
        label="Température",
        info="Des valeurs plus élevées donne plus de créativité, mais aussi d'étrangeté",
        interactive=True,
    ),
]
# Stylesheet handed to gr.Blocks. It has to exist before the layout is built;
# it only spaces out the result container that predict() emits.
css = """
.generation {
    margin-left: 2em;
    margin-right: 2em;
}
"""

# Single-page layout: one text box, one button, one HTML output area.
# NOTE(review): the heading and the output label still carry OCR-correction
# wording while the button refers to editorial structures - confirm the
# intended copy.
with gr.Blocks(theme='JohnSmith9982/small_and_pretty', css=css) as demo:
    gr.HTML("""<h1 style="text-align:center">Correction d'OCR</h1>""")
    text_input = gr.Textbox(label="Votre texte.", type="text", lines=1)
    text_button = gr.Button("Identifier les structures éditoriales")
    text_output = gr.HTML(label="Le texte corrigé")
    # Clicking the button sends the text box content to the classifier and
    # shows the returned HTML.
    text_button.click(mistral_bot.predict, inputs=text_input, outputs=[text_output])
# Start the app only when the file is run as a script; queue() is enabled
# before launch().
if __name__ == "__main__":
    demo.queue().launch()