Spaces:
Runtime error
Runtime error
Create a version for summarization using LangChain tools.
Browse files- functions.py +46 -7
functions.py
CHANGED
@@ -1,11 +1,16 @@
|
|
1 |
import os
|
2 |
-
import requests
|
3 |
import random
|
|
|
|
|
4 |
import torch
|
5 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
6 |
from peft import PeftConfig, PeftModel
|
7 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer,
|
8 |
-
|
9 |
|
10 |
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
11 |
|
@@ -21,6 +26,8 @@ shared = {
|
|
21 |
'full_text': None,
|
22 |
}
|
23 |
|
|
|
|
|
24 |
|
25 |
def get_nearest_examples(question: str, k: int):
|
26 |
print(['get_nearest_examples', 'start'])
|
@@ -81,7 +88,33 @@ def split_text(text: str):
|
|
81 |
return lines
|
82 |
|
83 |
|
84 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
print(['summarize_text', 'start'])
|
86 |
input_text = f'<s>Instruction: Elabora un resume del siguiente texto.\nInput: {text}\nOutput: '
|
87 |
batch = tokenizer(input_text, return_tensors='pt')
|
@@ -145,6 +178,7 @@ def answer_question(question: str):
|
|
145 |
max_new_tokens=256,
|
146 |
generation_config=generation_config)
|
147 |
output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
|
|
|
148 |
print(['answer_question', 'end'])
|
149 |
return output
|
150 |
|
@@ -165,7 +199,7 @@ def load_model(peft_model_id):
|
|
165 |
return model, tokenizer
|
166 |
|
167 |
|
168 |
-
def load_embeddings_model(model_ckpt:str):
|
169 |
print(['load_embeddings_model', 'start'])
|
170 |
print(['load_embeddings_model', 'loading tokenizer'])
|
171 |
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
|
@@ -176,5 +210,10 @@ def load_embeddings_model(model_ckpt:str):
|
|
176 |
return model, tokenizer
|
177 |
|
178 |
|
179 |
-
model, tokenizer = load_model(
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
|
|
2 |
import random
|
3 |
+
|
4 |
+
import requests
|
5 |
import torch
|
6 |
from bs4 import BeautifulSoup
|
7 |
+
from datasets import Dataset
|
8 |
+
from langchain.docstore.document import Document
|
9 |
+
from langchain.llms import HuggingFacePipeline
|
10 |
+
from langchain.text_splitter import CharacterTextSplitter
|
11 |
from peft import PeftConfig, PeftModel
|
12 |
+
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
|
13 |
+
GenerationConfig, pipeline)
|
14 |
|
15 |
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
16 |
|
|
|
26 |
'full_text': None,
|
27 |
}
|
28 |
|
29 |
+
# Module-level splitter used by summarize_text to chunk long documents
# before generation; constructed with LangChain's default settings.
text_splitter = CharacterTextSplitter()
|
30 |
+
|
31 |
|
32 |
def get_nearest_examples(question: str, k: int):
|
33 |
print(['get_nearest_examples', 'start'])
|
|
|
88 |
return lines
|
89 |
|
90 |
|
91 |
+
def remove_prompt(text: str) -> str:
    """Strip the instruction prompt from a generated completion.

    Keeps only the text after the first ``'Output: '`` marker and removes
    any ``'Input: '`` markers the model may have echoed back.

    Args:
        text: Full generated text, prompt included.

    Returns:
        The cleaned completion, stripped of surrounding whitespace. If no
        ``'Output: '`` marker is present, the whole text is returned
        (stripped) instead of raising — the original used ``str.index``,
        which raised ``ValueError`` and crashed the summarization path
        whenever the model did not echo the marker.
    """
    output_prompt = 'Output: '
    # str.partition never raises: `sep` is '' when the marker is missing.
    _, sep, after = text.partition(output_prompt)
    res = (after if sep else text).strip()
    return res.replace('Input: ', '')
|
97 |
+
|
98 |
+
|
99 |
+
def summarize_text(text: str) -> str:
    """Summarize *text* chunk-by-chunk and join the partial summaries.

    The text is split with the module-level ``text_splitter``, each chunk
    is wrapped in the Spanish instruction prompt, and the generation
    pipeline is run over all prompts in a single call. The cleaned
    per-chunk summaries are joined with blank lines.
    """
    print(['summarize_text', 'start'])

    print(['summarize_text', 'splitting text'])
    chunks = [Document(page_content=piece)
              for piece in text_splitter.split_text(text)]
    prompt_template = ('<s>Instruction: Elabora un resume del siguiente '
                       'texto.\nInput: {}\nOutput: ')
    prompts = [prompt_template.format(doc.page_content) for doc in chunks]

    print(['summarize_text', 'generating'])
    partial_summaries = []
    for generation in pipe(prompts):
        partial_summaries.append(remove_prompt(generation['generated_text']))

    print(['summarize_text', 'end'])
    return '\n\n'.join(partial_summaries)
|
115 |
+
|
116 |
+
|
117 |
+
def summarize_text_v1(text: str):
|
118 |
print(['summarize_text', 'start'])
|
119 |
input_text = f'<s>Instruction: Elabora un resume del siguiente texto.\nInput: {text}\nOutput: '
|
120 |
batch = tokenizer(input_text, return_tensors='pt')
|
|
|
178 |
max_new_tokens=256,
|
179 |
generation_config=generation_config)
|
180 |
output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
|
181 |
+
output = output.replace(input_text, '')
|
182 |
print(['answer_question', 'end'])
|
183 |
return output
|
184 |
|
|
|
199 |
return model, tokenizer
|
200 |
|
201 |
|
202 |
+
def load_embeddings_model(model_ckpt: str):
|
203 |
print(['load_embeddings_model', 'start'])
|
204 |
print(['load_embeddings_model', 'loading tokenizer'])
|
205 |
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
|
|
|
210 |
return model, tokenizer
|
211 |
|
212 |
|
213 |
+
# Module-level initialization: runs once at import time and downloads the
# models, so importing this module is slow and needs network access.

# Fine-tuned OPT-6.7B LoRA checkpoint used for summarization / QA.
model, tokenizer = load_model(
    "hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
# NOTE(review): "text2text-generation" is normally a seq2seq task, but this
# model is loaded via AutoModelForCausalLM — confirm the task string is
# intentional.
pipe = pipeline("text2text-generation", model=model,
                tokenizer=tokenizer, max_new_tokens=100)
# LangChain wrapper around the HF pipeline, for LangChain tooling.
llm = HuggingFacePipeline(pipeline=pipe)
# Sentence-embedding model (presumably for nearest-example retrieval in
# get_nearest_examples — verify against the full file).
emb_model, emb_tokenizer = load_embeddings_model(
    "sentence-transformers/multi-qa-mpnet-base-dot-v1")
|