Pclanglais committed on
Commit
4c91de3
·
verified ·
1 Parent(s): 1cf35d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -30
app.py CHANGED
@@ -19,6 +19,27 @@ token_classifier = pipeline(
19
 
20
  tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # Preprocess the 'word' column
23
  def preprocess_text(text):
24
  # Remove HTML tags
@@ -75,32 +96,24 @@ def split_text(text, max_tokens=500):
75
  return chunks
76
 
77
  def transform_chunks(marianne_segmentation):
78
-
79
- print(marianne_segmentation)
80
-
81
  marianne_segmentation = pd.DataFrame(marianne_segmentation)
82
-
83
- print(marianne_segmentation)
84
-
85
- # Filter out separators
86
  marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
87
-
88
- # Replace '¶' with '\n' and convert to string
89
  marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)
90
-
91
- # A bit of cleaning.
92
  marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
93
-
94
- marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != 'nan']
95
- marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != '']
96
- marianne_segmentation = marianne_segmentation[marianne_segmentation['word'] != ' ']
97
-
98
- # Add entity_group as a header to each word
99
- marianne_segmentation['word'] = '### ' + marianne_segmentation['entity_group'] + ' ###\n' + marianne_segmentation['word']
100
-
101
- final_text = '\n\n'.join(marianne_segmentation['word'].tolist())
 
 
102
 
103
- return final_text
 
104
 
105
 
106
  # Class to encapsulate the Falcon chatbot
@@ -109,22 +122,17 @@ class MistralChatBot:
109
  self.system_prompt = system_prompt
110
 
111
  def predict(self, user_message):
112
- #We drop the newlines.
113
- editorial_text = re.sub("\n", " ¶ ", user_message)
114
-
115
- # Tokenize the prompt and check if it exceeds 500 tokens
116
  num_tokens = len(tokenizer.tokenize(editorial_text))
117
-
118
  if num_tokens > 500:
119
- # Split the prompt into chunks
120
  batch_prompts = split_text(editorial_text, max_tokens=500)
121
  else:
122
  batch_prompts = [editorial_text]
123
-
124
  out = token_classifier(batch_prompts)
125
  out = transform_chunks(out[0])
126
- print(out)
127
- generated_text = '<h2 style="text-align:center">Réponse</h3>\n<div class="generation">' + out + "</div>"
128
  return generated_text
129
 
130
  # Create the Falcon chatbot instance
 
19
 
20
  tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
21
 
22
# Inline stylesheet that predict() prepends to the generated HTML. It styles the
# "manuscript" rows emitted by transform_chunks(): a grey italic "annotation"
# column (30%) holding the entity label next to a "content" column (70%).
+ css = """
23
+ <style>
24
+ .manuscript {
25
+ display: flex;
26
+ margin-bottom: 20px;
27
+ }
28
+ .annotation {
29
+ width: 30%;
30
+ padding-right: 20px;
31
+ color: grey;
32
+ font-style: italic;
33
+ }
34
+ .content {
35
+ width: 70%;
36
+ }
37
+ h3 {
38
+ margin-top: 0;
39
+ }
40
+ </style>
41
+ """
42
+
43
  # Preprocess the 'word' column
44
  def preprocess_text(text):
45
  # Remove HTML tags
 
96
  return chunks
97
 
98
  def transform_chunks(marianne_segmentation):
 
 
 
99
  marianne_segmentation = pd.DataFrame(marianne_segmentation)
 
 
 
 
100
  marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
 
 
101
  marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)
 
 
102
  marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
103
+ marianne_segmentation = marianne_segmentation[marianne_segmentation['word'].notna() & (marianne_segmentation['word'] != '') & (marianne_segmentation['word'] != ' ')]
104
+
105
+ html_output = []
106
+ for _, row in marianne_segmentation.iterrows():
107
+ entity_group = row['entity_group']
108
+ word = row['word']
109
+
110
+ if entity_group == 'title':
111
+ html_output.append(f'<div class="manuscript"><div class="annotation">{entity_group}</div><div class="content"><h3>{word}</h3></div></div>')
112
+ else:
113
+ html_output.append(f'<div class="manuscript"><div class="annotation">{entity_group}</div><div class="content">{word}</div></div>')
114
 
115
+ final_html = '\n'.join(html_output)
116
+ return final_html
117
 
118
 
119
  # Class to encapsulate the Falcon chatbot
 
122
  self.system_prompt = system_prompt
123
 
124
  def predict(self, user_message):
125
+ editorial_text = re.sub("\n", " ¶ ", user_message)
 
 
 
126
  num_tokens = len(tokenizer.tokenize(editorial_text))
127
+
128
  if num_tokens > 500:
 
129
  batch_prompts = split_text(editorial_text, max_tokens=500)
130
  else:
131
  batch_prompts = [editorial_text]
132
+
133
  out = token_classifier(batch_prompts)
134
  out = transform_chunks(out[0])
135
+ generated_text = f'{css}<h2 style="text-align:center">Réponse</h2>\n<div class="generation">{out}</div>'
 
136
  return generated_text
137
 
138
  # Create the Falcon chatbot instance