lauraparra28 committed
Commit 2e1aa7a · verified · 1 Parent(s): fd9eec8

Upload 10 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ embeddings/embeddings.xlsx filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,19 @@
+ import gradio as gr
+ import functions as fn
+ import json
+
+ data = fn.load_embeddings()
+ num_documents = data['num_documents']
+ num_segment_contents = data['num_segment_contents']
+
+ with open("gradio.json", encoding='utf-8') as f:
+     config = json.load(f)
+ config['description'] = config['description'].format(num_documents=num_documents, num_segment_contents=num_segment_contents)
+
+ def on_submit(query, history):
+     response = fn.rag_response(query, data=data, detailed_response=False)
+     return gr.HTML(response.replace("\n", "<br>"))
+
+ demo = gr.ChatInterface(fn=on_submit, **config)
+
+ demo.launch()
app_details.py ADDED
@@ -0,0 +1,20 @@
+ import gradio as gr
+ import functions as fn
+ import json
+
+ data = fn.load_embeddings()
+ num_documents = data['num_documents']
+ num_segment_contents = data['num_segment_contents']
+
+ with open("gradio.json", encoding='utf-8') as f:
+     config = json.load(f)
+ config['description'] = config['description'].format(num_documents=num_documents, num_segment_contents=num_segment_contents)
+ config['title'] += " - Interface de Respostas Detalhadas"
+
+ def on_submit(query, history):
+     response = fn.rag_response(query, data=data, detailed_response=True)
+     return gr.HTML(response.replace("\n", "<br>"))
+
+ demo = gr.ChatInterface(fn=on_submit, **config)
+
+ demo.launch()
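
Note: app_details.py differs from app.py only in passing detailed_response=True and in the title suffix; the detailed interface therefore also returns the full prompt, the token counts, and each retrieved segment with its similarity score (see rag_response in functions.py below).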
docs/dar_normas_academicas.txt ADDED
Binary file (262 kB).
 
docs/posgraduacao_stritosensu_regulamento.txt ADDED
Binary file (98.5 kB).
 
documents_names.json ADDED
@@ -0,0 +1,3 @@
+ { "dar_normas_academicas.txt": ["DAR - Normas Acadêmicas", "https://www.puc-rio.br/sobrepuc/depto/dar/download/dar_normas_academicas.pdf"],
+ "posgraduacao_stritosensu_regulamento.txt": ["Regulamento dos Programas de Pós-Graduação da PUC-Rio", "https://www.puc-rio.br/ensinopesq/ccpg/download/posgraduacao_stritosensu_regulamento.pdf"]
+ }
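
Note: each entry maps a text file under docs/ to a display title and the URL of the original PDF; functions.py loads this mapping through load_dictionary() to render the reference links beneath each answer.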
embeddings.py ADDED
@@ -0,0 +1,56 @@
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from sentence_transformers import SentenceTransformer
+ import os
+ import sys
+ import glob
+ import torch
+ import pandas as pd
+ from tqdm import tqdm
+
+ parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
+ sys.path.append(parent_dir)
+
+ import functions as fn
+
+ def get_embeddings(chunk_size, chunk_overlap, model_name, input_path='docs/*.txt', output_path='embeddings/embeddings.xlsx'):
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+         length_function=len,
+         is_separator_regex=False,
+     )
+
+     all_splitted_text = []
+     file_names = []
+
+     for file in glob.glob(input_path):
+         text = fn.load_text(file)
+         splitted_text = text_splitter.create_documents([text])
+         all_splitted_text.extend(splitted_text)
+         file_names.extend([os.path.basename(file)] * len(splitted_text))
+
+     model = SentenceTransformer(model_name)
+
+     embeddings_list = []
+     content_list = []
+     file_name_list = []
+     model_name_list = []
+
+     for segment, file_name in tqdm(zip(all_splitted_text, file_names), desc="Processando segmentos"):
+         embeddings = model.encode(segment.page_content)
+         embeddings_list.append(embeddings)
+         content_list.append(segment.page_content)
+         file_name_list.append(file_name)
+         model_name_list.append(model_name)
+
+     embeddings_df = pd.DataFrame(embeddings_list)
+     embeddings_df['segment_content'] = content_list
+     embeddings_df['file_name'] = file_name_list
+     embeddings_df['model_name'] = model_name_list
+
+     embeddings_df.to_excel(output_path, index=False)
+
+ if __name__ == "__main__":
+     current_dir = os.getcwd()
+     get_embeddings(chunk_size=512, chunk_overlap=100, model_name='intfloat/multilingual-e5-large')
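
Note: regenerating the index after editing the corpus under docs/ is a direct call to the function above; a minimal sketch, mirroring the __main__ block (run from the repository root so the relative paths resolve):

    from embeddings import get_embeddings

    # Rebuilds embeddings/embeddings.xlsx from every .txt under docs/
    get_embeddings(chunk_size=512, chunk_overlap=100,
                   model_name='intfloat/multilingual-e5-large')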
embeddings/embeddings.xlsx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03f5000178f061609ae23a245440a8cd7638769dbc9ee642b25f413b7c088664
+ size 7187184
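
Note: this is a standard Git LFS pointer standing in for the roughly 7.2 MB spreadsheet itself; it resolves to the real file because of the embeddings/embeddings.xlsx rule added to .gitattributes above.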
functions.py ADDED
@@ -0,0 +1,115 @@
+ import chardet
+ import torch
+ from langchain_openai import ChatOpenAI
+ from langchain_core.prompts import PromptTemplate
+ from sentence_transformers import SentenceTransformer
+ import os
+ import pandas as pd
+ import json
+
+ current_dir = os.getcwd()
+
+ def load_api_key(file_path):
+     with open(file_path, 'r', encoding='utf-8') as file:
+         data = json.load(file)
+     return data.get('api_key')
+
+ def load_dictionary(json_path):
+     with open(json_path, 'r', encoding='utf-8') as file:
+         return json.load(file)
+
+ def detect_encoding(file_path):
+     with open(file_path, 'rb') as file:
+         raw_data = file.read()
+     result = chardet.detect(raw_data)
+     return result['encoding']
+
+ def load_text(file_path):
+     encoding = detect_encoding(file_path)
+     with open(file_path, 'r', encoding=encoding) as file:
+         return file.read()
+
+ def search_query(query, embeddings_tensor, model, segment_contents, file_names, k=5):
+     query_embedding = torch.tensor(model.encode(query)).unsqueeze(0)
+     similarities = torch.mm(query_embedding, embeddings_tensor.t()).squeeze(0)
+     topk_similarities, topk_indices = torch.topk(similarities, k)
+
+     top_segments = [segment_contents[idx] for idx in topk_indices]
+     top_file_names = [file_names[idx] for idx in topk_indices]
+     top_similarities = topk_similarities.tolist()
+
+     return top_segments, top_file_names, top_similarities
+
+ def load_embeddings(file_path="embeddings/embeddings.xlsx"):
+     embeddings_df = pd.read_excel(os.path.join(current_dir, file_path))
+     embeddings = embeddings_df.iloc[:, :-3].values
+     segment_contents = embeddings_df['segment_content'].values
+     num_segment_contents = len(segment_contents)
+     num_documents = embeddings_df['file_name'].nunique()
+     file_names = embeddings_df['file_name'].values
+     model_name = embeddings_df['model_name'].values[0]
+
+     return {
+         "embeddings": embeddings,
+         "segment_contents": segment_contents,
+         "num_documents": num_documents,
+         "num_segment_contents": num_segment_contents,
+         "file_names": file_names,
+         "model_name": model_name,
+     }
+
+ def generate_answer_with_references(query, data):
+     embeddings = data["embeddings"]
+     segment_contents = data["segment_contents"]
+     model_name = data["model_name"]
+     file_names = data["file_names"]
+     embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)
+     model = SentenceTransformer(model_name)
+     dictionary_path = os.path.join(current_dir, 'documents_names.json')
+     file_name_dict = load_dictionary(dictionary_path)
+     file_names = [file_name_dict.get(name, name) for name in file_names]
+
+     top_segments, top_file_names, top_similarities = search_query(query, embeddings_tensor, model, segment_contents, file_names, k=5)
+     context = "\n----\n".join(top_segments)
+     prompt_template = """
+ Você é um assistente de inteligência artificial que responde a perguntas baseadas nos documentos de forma detalhada na forma culta da língua portuguesa.
+ Não é possível gerar informações ou fornecer informações que não estejam contidas nos documentos recuperados.
+ Se a informação não se encontra nos documentos, responda com: Não foi possível encontrar a informação requerida nos documentos.
+
+ Contexto:
+
+ {context}
+
+ Pergunta: {query}
+
+ Resposta:""".format(context=context, query=query)
+
+     qa_prompt = PromptTemplate.from_template(prompt_template)
+     api_key = load_api_key('api_key.json')
+
+     llm = ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo")
+     response = llm.invoke(qa_prompt.template)
+     resposta = response.content
+     total_tokens = response.response_metadata['token_usage']['total_tokens']
+     prompt_tokens = response.response_metadata['token_usage']['prompt_tokens']
+
+     return resposta, total_tokens, prompt_tokens, top_segments, top_file_names, top_similarities, prompt_template
+
+ def rag_response(query, data, detailed_response):
+     resposta, total_tokens, prompt_tokens, top_segments, top_file_names, top_similarities, prompt_template = generate_answer_with_references(query, data)
+     file_names = [x[0] for x in top_file_names]
+     file_links = {x[0]: x[1] for x in top_file_names}
+
+     if detailed_response:
+         references_detail = "\n\n".join([
+             f"* Segmento: {segment}\nArquivo: <a href='{file_links[file_name]}' target='_blank'>{file_name}</a>\nSimilaridade: {similarity:.4f}"
+             for segment, file_name, similarity in zip(top_segments, file_names, top_similarities)])
+
+         formatted_detailed_response = f"Resposta:\n\n{resposta}\n\nPrompt:\n{prompt_template}\n\nPrompt Tokens: {prompt_tokens}\nTotal Tokens: {total_tokens}\n\n{references_detail}"
+
+         return formatted_detailed_response
+     else:
+         file_set = set(file_names)
+         references = "\n".join("<a href='{}' target='_blank'>{}</a>".format(file_links[file_name], file_name) for file_name in file_set)
+         formatted_response = f"{resposta}\n\n----\n{references}"
+         return formatted_response
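
Note: load_api_key('api_key.json') expects a JSON file next to functions.py whose 'api_key' field holds the OpenAI key (the file itself is not part of this upload). With it in place, a minimal sanity check outside Gradio might look like this sketch, using one of the example questions from gradio.json:

    import functions as fn

    # Load the precomputed index once, then run a single query through the RAG pipeline
    data = fn.load_embeddings()
    print(fn.rag_response("Quando deve ser renovada a matrícula?",
                          data=data, detailed_response=False))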
gradio.json ADDED
@@ -0,0 +1,16 @@
+ {
+     "title": "Chatbot PUC-Rio",
+     "description": "<center>O assistente tem acesso a {num_documents} documentos ({num_segment_contents} parágrafos)</center>",
+     "examples": [
+         ["Quando deve ser renovada a matrícula?"],
+         ["O que é o histórico escolar?"], ["Como posso virar aluno da PUC-Rio?"],
+         ["Quais são os requisitos de proficiência linguística para os alunos de mestrado no programa?"],
+         ["Como faço para cancelar uma disciplina?"], ["A PUC-Rio tem curso de medicina?"]
+     ],
+     "theme": "gradio/default",
+     "submit_btn": "Enviar",
+     "stop_btn": "Parar",
+     "retry_btn": "🔄 Tentar novamente",
+     "undo_btn": "↩️ Desfazer",
+     "clear_btn": "🗑️ Limpar"
+ }
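
Note: submit_btn, stop_btn, retry_btn, undo_btn, and clear_btn are ChatInterface keyword arguments from the Gradio 4.x API, which is presumably why requirements.txt pins gradio==4.37.1; later Gradio releases reworked these parameters, so the pin matters if this config is reused.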
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ chardet==5.2.0
+ torch==2.3.0
+ langchain_text_splitters
+ sentence-transformers==3.0.1
+ pandas
+ tqdm
+ openpyxl
+ gradio==4.37.1
+ langchain-openai
+ langchain-core
+ langchain