milyiyo committed on
Commit
c5a914e
·
1 Parent(s): 3a41c52

Create a version for summarization using LangChain tools.

Browse files
Files changed (1) hide show
  1. functions.py +46 -7
functions.py CHANGED
@@ -1,11 +1,16 @@
1
  import os
2
- import requests
3
  import random
 
 
4
  import torch
5
  from bs4 import BeautifulSoup
 
 
 
 
6
  from peft import PeftConfig, PeftModel
7
- from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, AutoModel
8
- from datasets import DatasetDict, Dataset
9
 
10
  # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
11
 
@@ -21,6 +26,8 @@ shared = {
21
  'full_text': None,
22
  }
23
 
 
 
24
 
25
  def get_nearest_examples(question: str, k: int):
26
  print(['get_nearest_examples', 'start'])
@@ -81,7 +88,33 @@ def split_text(text: str):
81
  return lines
82
 
83
 
84
- def summarize_text(text: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  print(['summarize_text', 'start'])
86
  input_text = f'<s>Instruction: Elabora un resume del siguiente texto.\nInput: {text}\nOutput: '
87
  batch = tokenizer(input_text, return_tensors='pt')
@@ -145,6 +178,7 @@ def answer_question(question: str):
145
  max_new_tokens=256,
146
  generation_config=generation_config)
147
  output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
 
148
  print(['answer_question', 'end'])
149
  return output
150
 
@@ -165,7 +199,7 @@ def load_model(peft_model_id):
165
  return model, tokenizer
166
 
167
 
168
- def load_embeddings_model(model_ckpt:str):
169
  print(['load_embeddings_model', 'start'])
170
  print(['load_embeddings_model', 'loading tokenizer'])
171
  tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
@@ -176,5 +210,10 @@ def load_embeddings_model(model_ckpt:str):
176
  return model, tokenizer
177
 
178
 
179
- model, tokenizer = load_model("hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
180
- emb_model, emb_tokenizer = load_embeddings_model("sentence-transformers/multi-qa-mpnet-base-dot-v1")
 
 
 
 
 
 
1
  import os
 
2
  import random
3
+
4
+ import requests
5
  import torch
6
  from bs4 import BeautifulSoup
7
+ from datasets import Dataset
8
+ from langchain.docstore.document import Document
9
+ from langchain.llms import HuggingFacePipeline
10
+ from langchain.text_splitter import CharacterTextSplitter
11
  from peft import PeftConfig, PeftModel
12
+ from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
13
+ GenerationConfig, pipeline)
14
 
15
  # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
16
 
 
26
  'full_text': None,
27
  }
28
 
29
+ text_splitter = CharacterTextSplitter()
30
+
31
 
32
  def get_nearest_examples(question: str, k: int):
33
  print(['get_nearest_examples', 'start'])
 
88
  return lines
89
 
90
 
91
def remove_prompt(text: str) -> str:
    """Strip the instruction prompt from a generated string.

    Keeps only the content after the first ``'Output: '`` marker and
    drops any stray ``'Input: '`` labels the model may have echoed.

    Args:
        text: Full generated text, normally the prompt followed by
            ``'Output: '`` and the model's completion.

    Returns:
        The completion text with surrounding whitespace stripped. If the
        ``'Output: '`` marker is absent (the model did not echo it), the
        whole text is returned stripped instead of raising ValueError.
    """
    output_prompt = 'Output: '
    # partition() avoids the ValueError str.index() raised when the
    # marker is missing; the first occurrence is the one in the prompt,
    # so everything after it is the model's completion.
    _, sep, tail = text.partition(output_prompt)
    res = (tail if sep else text).strip()
    # The model sometimes repeats the 'Input: ' label inside the answer.
    return res.replace('Input: ', '')
97
+
98
+
99
def summarize_text(text: str) -> str:
    """Summarize *text* chunk by chunk and join the partial summaries.

    The text is split with the module-level ``text_splitter``, each chunk
    is wrapped in an instruction prompt, run through the shared ``pipe``
    generation pipeline, and the cleaned outputs are joined with blank
    lines.
    """
    print(['summarize_text', 'start'])

    print(['summarize_text', 'splitting text'])
    chunks = text_splitter.split_text(text)
    documents = [Document(page_content=chunk) for chunk in chunks]
    prompts = []
    for doc in documents:
        prompts.append(
            f'<s>Instruction: Elabora un resume del siguiente texto.\n'
            f'Input: {doc.page_content}\nOutput: '
        )

    print(['summarize_text', 'generating'])
    partial_summaries = []
    for generation in pipe(prompts):
        partial_summaries.append(remove_prompt(generation['generated_text']))
    joined = '\n\n'.join(partial_summaries)

    print(['summarize_text', 'end'])
    return joined
115
+
116
+
117
+ def summarize_text_v1(text: str):
118
  print(['summarize_text', 'start'])
119
  input_text = f'<s>Instruction: Elabora un resume del siguiente texto.\nInput: {text}\nOutput: '
120
  batch = tokenizer(input_text, return_tensors='pt')
 
178
  max_new_tokens=256,
179
  generation_config=generation_config)
180
  output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
181
+ output = output.replace(input_text, '')
182
  print(['answer_question', 'end'])
183
  return output
184
 
 
199
  return model, tokenizer
200
 
201
 
202
+ def load_embeddings_model(model_ckpt: str):
203
  print(['load_embeddings_model', 'start'])
204
  print(['load_embeddings_model', 'loading tokenizer'])
205
  tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
 
210
  return model, tokenizer
211
 
212
 
213
# --- Module-level model setup: executed once at import time (heavy I/O). ---
# Loads the PEFT summarization model and its tokenizer.
+ model, tokenizer = load_model(
214
+ "hackathon-somos-nlp-2023/opt-6.7b-lora-sag-t3000-v300-v2")
215
# Shared generation pipeline reused by summarize_text().
# NOTE(review): the imports bring in AutoModelForCausalLM, so the model is
# presumably causal; "text2text-generation" may need to be
# "text-generation" — confirm against the transformers pipeline task list.
+ pipe = pipeline("text2text-generation", model=model,
216
+ tokenizer=tokenizer, max_new_tokens=100)
217
# LangChain wrapper around the pipeline.
# NOTE(review): `llm` is not referenced anywhere in the visible code —
# presumably intended for LangChain chains; verify it is actually used.
+ llm = HuggingFacePipeline(pipeline=pipe)
218
# Sentence-embedding model/tokenizer — presumably used by
# get_nearest_examples for retrieval; confirm in that function's body.
+ emb_model, emb_tokenizer = load_embeddings_model(
219
+ "sentence-transformers/multi-qa-mpnet-base-dot-v1")