luanpoppe commited on
Commit
16867c3
·
1 Parent(s): b287766

feat: tentando melhorar espaçamento da resposta final

Browse files
Files changed (1) hide show
  1. _utils/utils.py +91 -78
_utils/utils.py CHANGED
@@ -16,68 +16,74 @@ import openai
16
  import pandas as pd
17
  import markdown
18
 
19
- os.environ["LANGCHAIN_TRACING_V2"]="true"
20
- os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
21
  os.environ.get("LANGCHAIN_API_KEY")
22
- os.environ["LANGCHAIN_PROJECT"]="VELLA"
23
  os.environ.get("OPENAI_API_KEY")
24
  os.environ.get("HUGGINGFACEHUB_API_TOKEN")
25
- embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
 
 
26
 
27
  allIds = []
28
 
 
29
  def getPDF(file_paths):
30
- documentId = 0
31
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
32
- pages = []
33
- for file in file_paths:
34
- loader = PyPDFLoader(file, extract_images=False)
35
- pagesDoc = loader.load_and_split(text_splitter)
36
- pages = pages + pagesDoc
37
-
38
- for page in pages:
39
- documentId = str(uuid4())
40
- allIds.append(documentId)
41
- page.id = documentId
42
- return pages
 
43
 
44
  def create_retriever(documents, vectorstore):
45
- print('\n\n')
46
- print('documents: ', documents[:2])
47
 
48
- vectorstore.add_documents(documents=documents)
 
 
 
 
 
 
 
49
 
50
- retriever = vectorstore.as_retriever(
51
- # search_type="similarity",
52
- # search_kwargs={"k": 3},
53
- )
54
-
55
- return retriever
56
 
57
  def create_prompt_llm_chain(system_prompt, modelParam):
58
- model = create_llm(modelParam)
59
-
60
- system_prompt = system_prompt + "\n\n" + "{context}"
61
- prompt = ChatPromptTemplate.from_messages(
62
- [
63
- ("system", system_prompt),
64
- ("human", "{input}"),
65
- ]
66
- )
67
- question_answer_chain = create_stuff_documents_chain(model, prompt)
68
- return question_answer_chain
 
69
 
70
  def create_llm(modelParam):
71
- if modelParam == default_model:
72
- return ChatOpenAI(model=modelParam, max_tokens=16384)
73
- else:
74
- return HuggingFaceEndpoint(
75
- repo_id=modelParam,
76
- task="text-generation",
77
- max_new_tokens=1100,
78
- do_sample=False,
79
- huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN")
80
- )
81
 
82
 
83
  class Resumo(BaseModel):
@@ -87,46 +93,49 @@ class Resumo(BaseModel):
87
  doutrina: str = Field()
88
  palavras_chave: List[str] = Field()
89
 
 
90
  def create_prompt_llm_chain_summary(system_prompt, model_param):
91
- prompt_and_llm = create_prompt_and_llm(system_prompt, model_param)
 
 
 
 
 
 
92
 
93
- question_answer_chain = create_stuff_documents_chain(prompt_and_llm["model"], prompt_and_llm["prompt"])
94
- final_chain = question_answer_chain | JsonOutputParser(pydantic_object=Resumo)
95
- return final_chain
96
 
97
  def process_embedding_summary(system_prompt, model_param, full_text):
98
- prompt_and_llm = create_prompt_and_llm(system_prompt, model_param)
99
-
100
- text_splitter=RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
101
- docs = text_splitter.create_documents([full_text])
102
- embeddings=get_embeddings([doc.page_content for doc in docs])
103
-
104
- content_list = [doc.page_content for doc in docs]
105
- df = pd.DataFrame(content_list, columns=['page_content'])
106
- vectors = [embedding.embedding for embedding in embeddings]
107
- array = np.array(vectors)
108
- embeddings_series = pd.Series(list(array))
109
- df['embeddings'] = embeddings_series
110
 
111
 
112
  def get_embeddings(text):
113
- response = openai.embeddings.create(
114
- model="text-embedding-3-small",
115
- input=text
116
- )
117
- return response.data
118
 
119
  def create_prompt_and_llm(system_prompt, model_param):
120
- model = create_llm(model_param)
121
-
122
- system_prompt = system_prompt + "\n\n" + "{context}"
123
- prompt = ChatPromptTemplate.from_messages(
124
- [
125
- ("system", system_prompt),
126
- ("human", "{input}"),
127
- ]
128
- )
129
- return {"model": model, "prompt": prompt}
 
130
 
131
  DEFAULT_SYSTEM_PROMPT = """
132
 
@@ -201,6 +210,10 @@ def convert_markdown_to_HTML(text: str):
201
  .replace("<diagnostico_processual>", "")
202
  .replace("</diagnostico_processual>", "")
203
  .replace("xml", "")
 
 
 
 
204
  .replace("\n", "\n\n")
205
  )
206
  html = markdown.markdown(texto_inicial)
 
16
  import pandas as pd
17
  import markdown
18
 
19
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
20
+ os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
21
  os.environ.get("LANGCHAIN_API_KEY")
22
+ os.environ["LANGCHAIN_PROJECT"] = "VELLA"
23
  os.environ.get("OPENAI_API_KEY")
24
  os.environ.get("HUGGINGFACEHUB_API_TOKEN")
25
+ embeddings_model = HuggingFaceEmbeddings(
26
+ model_name="sentence-transformers/all-mpnet-base-v2"
27
+ )
28
 
29
  allIds = []
30
 
31
+
32
def getPDF(file_paths):
    """Load and split every PDF in *file_paths* into chunked Document pages.

    Each chunk is tagged with a fresh UUID4 string as its ``id``; the ids are
    also appended to the module-level ``allIds`` list (side effect kept for
    backward compatibility with the rest of the module).

    Args:
        file_paths: iterable of filesystem paths to PDF files.

    Returns:
        list: all chunked pages from every input file, with ``id`` set.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pages = []
    for file in file_paths:
        # extract_images=False: only the text layer is needed here.
        loader = PyPDFLoader(file, extract_images=False)
        # extend instead of repeated `pages = pages + ...` (avoids quadratic copies).
        pages.extend(loader.load_and_split(text_splitter))

    for page in pages:
        # The original initialized documentId = 0 before this loop; that value
        # was dead code — it is always overwritten with a UUID string here.
        document_id = str(uuid4())
        allIds.append(document_id)
        page.id = document_id
    return pages
46
+
47
 
48
def create_retriever(documents, vectorstore):
    """Index *documents* into *vectorstore* and return a retriever over it.

    The leftover debug ``print`` calls (which dumped the first two documents
    to stdout on every invocation) were removed.

    Args:
        documents: list of LangChain Documents to add to the vector store.
        vectorstore: a vector store exposing ``add_documents`` and
            ``as_retriever``.

    Returns:
        A retriever backed by *vectorstore* with default search settings.
    """
    vectorstore.add_documents(documents=documents)

    retriever = vectorstore.as_retriever(
        # Defaults kept on purpose; tune here if needed, e.g.:
        # search_type="similarity",
        # search_kwargs={"k": 3},
    )
    return retriever
60
 
 
 
 
 
 
 
61
 
62
def create_prompt_llm_chain(system_prompt, modelParam):
    """Build a stuff-documents question-answer chain for *modelParam*.

    The retrieved documents are interpolated into the system message through
    the ``{context}`` placeholder; the user question goes in as ``{input}``.
    """
    llm = create_llm(modelParam)

    full_system_prompt = system_prompt + "\n\n" + "{context}"
    chat_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", full_system_prompt),
            ("human", "{input}"),
        ]
    )
    return create_stuff_documents_chain(llm, chat_prompt)
74
+
75
 
76
def create_llm(modelParam):
    """Return a chat model for *modelParam*.

    The project's default model is served through OpenAI; any other id is
    treated as a Hugging Face Hub repo and served via HuggingFaceEndpoint.
    """
    if modelParam != default_model:
        return HuggingFaceEndpoint(
            repo_id=modelParam,
            task="text-generation",
            max_new_tokens=1100,
            # Deterministic generation (no sampling).
            do_sample=False,
            huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
        )
    return ChatOpenAI(model=modelParam, max_tokens=16384)
87
 
88
 
89
  class Resumo(BaseModel):
 
93
  doutrina: str = Field()
94
  palavras_chave: List[str] = Field()
95
 
96
+
97
def create_prompt_llm_chain_summary(system_prompt, model_param):
    """Build a summary chain whose output is parsed against the Resumo schema."""
    parts = create_prompt_and_llm(system_prompt, model_param)

    qa_chain = create_stuff_documents_chain(parts["model"], parts["prompt"])
    # Pipe through a JSON parser so the chain yields a Resumo-shaped dict.
    return qa_chain | JsonOutputParser(pydantic_object=Resumo)
105
 
 
 
 
106
 
107
def process_embedding_summary(system_prompt, model_param, full_text):
    """Chunk *full_text*, embed each chunk, and collect the result in a DataFrame.

    Args:
        system_prompt: system prompt forwarded to ``create_prompt_and_llm``.
        model_param: model identifier forwarded to ``create_prompt_and_llm``.
        full_text: raw text to split (2000-char chunks, 200 overlap) and embed.

    Returns:
        pandas.DataFrame: one row per chunk, columns ``page_content`` and
        ``embeddings``. The original built this frame and silently discarded
        it (implicit ``return None``); returning it makes the computation
        usable and is backward compatible for callers ignoring the result.
    """
    # NOTE(review): this prompt/llm pair is built but never used in this
    # function; kept for parity with the previous behavior — confirm whether
    # it can be dropped.
    prompt_and_llm = create_prompt_and_llm(system_prompt, model_param)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    docs = text_splitter.create_documents([full_text])
    embeddings = get_embeddings([doc.page_content for doc in docs])

    content_list = [doc.page_content for doc in docs]
    df = pd.DataFrame(content_list, columns=["page_content"])
    # Each OpenAI embedding object exposes its vector on `.embedding`.
    vectors = [embedding.embedding for embedding in embeddings]
    df["embeddings"] = pd.Series(list(np.array(vectors)))
    return df
120
 
121
 
122
def get_embeddings(text):
    """Embed *text* with OpenAI's text-embedding-3-small model.

    Returns the ``data`` list of embedding objects from the API response.
    """
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return response.data
125
+
 
 
126
 
127
def create_prompt_and_llm(system_prompt, model_param):
    """Return the LLM and chat prompt template used by the summary chains.

    Returns a dict with keys ``"model"`` (the chat model) and ``"prompt"``
    (a ChatPromptTemplate with system + human messages).
    """
    llm = create_llm(model_param)

    # "{context}" is where the stuffed documents will be interpolated.
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt + "\n\n" + "{context}"),
            ("human", "{input}"),
        ]
    )
    return {"model": llm, "prompt": prompt}
138
+
139
 
140
  DEFAULT_SYSTEM_PROMPT = """
141
 
 
210
  .replace("<diagnostico_processual>", "")
211
  .replace("</diagnostico_processual>", "")
212
  .replace("xml", "")
213
+ .replace("<li>\n", "<li>")
214
+ .replace("<ol>\n<li>", "<ol><li>")
215
+ .replace("</li>\n</ol>", "</li></ol>")
216
+ .replace("</li>\n<li>", "</li><li>")
217
  .replace("\n", "\n\n")
218
  )
219
  html = markdown.markdown(texto_inicial)