Pclanglais committed on
Commit
9fcaecd
·
verified ·
1 Parent(s): bd727cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -12
app.py CHANGED
@@ -19,6 +19,16 @@ token_classifier = pipeline(
19
 
20
  tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
21
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def split_text(text, max_tokens=500):
24
  # Split the text by newline characters
@@ -64,6 +74,32 @@ def split_text(text, max_tokens=500):
64
 
65
  return chunks
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  # Class to encapsulate the Mistral chatbot
@@ -85,6 +121,7 @@ class MistralChatBot:
85
  batch_prompts = [editorial_text]
86
 
87
  out = token_classifier(batch_prompts)
 
88
  print(out)
89
  generated_text = '<h2 style="text-align:center">Réponse</h3>\n<div class="generation">' + out + "</div>"
90
  return generated_text
@@ -102,18 +139,6 @@ examples = [
102
  ]
103
  ]
104
 
105
- additional_inputs=[
106
- gr.Slider(
107
- label="Température",
108
- value=0.2, # Default value
109
- minimum=0.05,
110
- maximum=1.0,
111
- step=0.05,
112
- interactive=True,
113
- info="Des valeurs plus élevées donne plus de créativité, mais aussi d'étrangeté",
114
- ),
115
- ]
116
-
117
  demo = gr.Blocks()
118
 
119
  with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
 
19
 
20
  tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
21
 
22
+ # Preprocess the 'word' column
23
def preprocess_text(text):
    """Strip markup from *text* and normalise its whitespace.

    Anything that looks like an HTML tag (``<...>``) is deleted, newlines
    and every run of whitespace collapse to a single space, and the result
    is trimmed at both ends.
    """
    # Applied in order: drop tags, newline -> space, squeeze whitespace runs.
    substitutions = (
        (r'<[^>]+>', ''),
        (r'\n', ' '),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
32
 
33
  def split_text(text, max_tokens=500):
34
  # Split the text by newline characters
 
74
 
75
  return chunks
76
 
77
def transform_chunks(marianne_segmentation):
    """Render classified text segments as one annotated string.

    Every kept segment becomes a ``### <entity_group> ###`` header line
    followed by its cleaned text; segments are joined with a blank line.

    Args:
        marianne_segmentation: Segments exposing ``'entity_group'`` and
            ``'word'`` fields. Accepted shapes are a pandas DataFrame, a
            list of dicts, or a list of lists of dicts (presumably what the
            token-classification pipeline returns for a batch of prompts;
            verify against the caller).

    Returns:
        The concatenated text, or ``''`` when no segment survives filtering.
    """
    # Normalise every accepted input shape to a flat list of records so the
    # rest of the function does not depend on DataFrame indexing semantics.
    if hasattr(marianne_segmentation, "to_dict"):
        records = marianne_segmentation.to_dict("records")
    else:
        records = []
        for item in marianne_segmentation:
            if isinstance(item, dict):
                records.append(item)
            else:
                # One inner list per prompt in the batch.
                records.extend(item)

    sections = []
    for record in records:
        entity_group = record["entity_group"]
        # Filter out separators.
        if entity_group == "separator":
            continue

        # Replace '¶' with a newline, then apply the shared clean-up.
        word = preprocess_text(str(record["word"]).replace("¶", "\n"))

        # Drop segments that are empty after cleaning, and missing values
        # that were stringified to 'nan'.
        if word in ("", "nan"):
            continue

        # Add entity_group as a header to each segment.
        sections.append(f"### {entity_group} ###\n{word}")

    return "\n\n".join(sections)
103
 
104
 
105
  # Class to encapsulate the Mistral chatbot
 
121
  batch_prompts = [editorial_text]
122
 
123
  out = token_classifier(batch_prompts)
124
+ out = transform_chunks(out)
125
  print(out)
126
  generated_text = '<h2 style="text-align:center">Réponse</h3>\n<div class="generation">' + out + "</div>"
127
  return generated_text
 
139
  ]
140
  ]
141
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  demo = gr.Blocks()
143
 
144
  with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo: