Spaces:
Sleeping
Sleeping
luanpoppe
committed on
Commit
·
234f840
1
Parent(s):
55f46c1
feat: adicionando melhorias e correções do contextual retriever
Browse files
_antigos/resumos/serializer.py
CHANGED
@@ -25,5 +25,5 @@ class ResumoCursorSerializer(serializers.Serializer):
|
|
25 |
user_message = serializers.CharField(required=False, default="")
|
26 |
model = serializers.CharField(required=False, default=default_model)
|
27 |
hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
|
28 |
-
chunk_size = serializers.IntegerField(required=False, default=
|
29 |
-
chunk_overlap = serializers.IntegerField(required=False, default=
|
|
|
25 |
user_message = serializers.CharField(required=False, default="")
|
26 |
model = serializers.CharField(required=False, default=default_model)
|
27 |
hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
|
28 |
+
chunk_size = serializers.IntegerField(required=False, default=5000)
|
29 |
+
chunk_overlap = serializers.IntegerField(required=False, default=1600)
|
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
@@ -174,7 +174,8 @@ class ContextualRetriever:
|
|
174 |
ContextualizedChunk(
|
175 |
content=chunk.content,
|
176 |
page_number=chunk.page_number,
|
177 |
-
|
|
|
178 |
start_char=chunk.start_char,
|
179 |
end_char=chunk.end_char,
|
180 |
context=" ".join(result[index][1:2]),
|
|
|
174 |
ContextualizedChunk(
|
175 |
content=chunk.content,
|
176 |
page_number=chunk.page_number,
|
177 |
+
id_do_processo=result[index][0],
|
178 |
+
chunk_id=chunk.chunk_id,
|
179 |
start_char=chunk.start_char,
|
180 |
end_char=chunk.end_char,
|
181 |
context=" ".join(result[index][1:2]),
|
_utils/models/gerar_relatorio.py
CHANGED
@@ -10,6 +10,7 @@ class DocumentChunk:
|
|
10 |
chunk_id: str
|
11 |
start_char: int
|
12 |
end_char: int
|
|
|
13 |
|
14 |
|
15 |
@dataclass
|
|
|
10 |
chunk_id: str
|
11 |
start_char: int
|
12 |
end_char: int
|
13 |
+
id_do_processo: int = 0
|
14 |
|
15 |
|
16 |
@dataclass
|
_utils/vector_stores/Vector_store_class.py
CHANGED
@@ -21,7 +21,7 @@ class VectorStore:
|
|
21 |
# Prepare texts with context
|
22 |
if is_contextualized_chunk:
|
23 |
texts = [
|
24 |
-
f"Document_id: {chunk.
|
25 |
for chunk in chunks
|
26 |
]
|
27 |
else:
|
@@ -35,6 +35,7 @@ class VectorStore:
|
|
35 |
metadatas.append(
|
36 |
{
|
37 |
"chunk_id": chunk.chunk_id,
|
|
|
38 |
"page": chunk.page_number,
|
39 |
"start_char": chunk.start_char,
|
40 |
"end_char": chunk.end_char,
|
@@ -46,6 +47,7 @@ class VectorStore:
|
|
46 |
metadatas.append(
|
47 |
{
|
48 |
"chunk_id": chunk.chunk_id,
|
|
|
49 |
"page": chunk.page_number,
|
50 |
"start_char": chunk.start_char,
|
51 |
"end_char": chunk.end_char,
|
|
|
21 |
# Prepare texts with context
|
22 |
if is_contextualized_chunk:
|
23 |
texts = [
|
24 |
+
f"Document_id: {chunk.id_do_processo}\nDocument_context: {chunk.context}\nDocument_content: {chunk.content}"
|
25 |
for chunk in chunks
|
26 |
]
|
27 |
else:
|
|
|
35 |
metadatas.append(
|
36 |
{
|
37 |
"chunk_id": chunk.chunk_id,
|
38 |
+
"id_do_processo": chunk.id_do_processo,
|
39 |
"page": chunk.page_number,
|
40 |
"start_char": chunk.start_char,
|
41 |
"end_char": chunk.end_char,
|
|
|
47 |
metadatas.append(
|
48 |
{
|
49 |
"chunk_id": chunk.chunk_id,
|
50 |
+
"id_do_processo": chunk.id_do_processo,
|
51 |
"page": chunk.page_number,
|
52 |
"start_char": chunk.start_char,
|
53 |
"end_char": chunk.end_char,
|
gerar_documento/serializer.py
CHANGED
@@ -33,7 +33,7 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
|
|
33 |
embedding_weight = serializers.FloatField(default=0.5)
|
34 |
bm25_weight = serializers.FloatField(default=0.5)
|
35 |
context_window = serializers.IntegerField(default=3)
|
36 |
-
chunk_overlap = serializers.IntegerField(default=
|
37 |
num_k_rerank = serializers.IntegerField(default=20)
|
38 |
model_cohere_rerank = serializers.CharField(
|
39 |
required=False, default="rerank-english-v2.0"
|
@@ -61,7 +61,7 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
|
|
61 |
embedding_weight = serializers.FloatField(default=0.5)
|
62 |
bm25_weight = serializers.FloatField(default=0.5)
|
63 |
context_window = serializers.IntegerField(default=3)
|
64 |
-
chunk_overlap = serializers.IntegerField(default=
|
65 |
num_k_rerank = serializers.IntegerField(default=20)
|
66 |
model_cohere_rerank = serializers.CharField(
|
67 |
required=False, default="rerank-english-v2.0"
|
|
|
33 |
embedding_weight = serializers.FloatField(default=0.5)
|
34 |
bm25_weight = serializers.FloatField(default=0.5)
|
35 |
context_window = serializers.IntegerField(default=3)
|
36 |
+
chunk_overlap = serializers.IntegerField(default=1600)
|
37 |
num_k_rerank = serializers.IntegerField(default=20)
|
38 |
model_cohere_rerank = serializers.CharField(
|
39 |
required=False, default="rerank-english-v2.0"
|
|
|
61 |
embedding_weight = serializers.FloatField(default=0.5)
|
62 |
bm25_weight = serializers.FloatField(default=0.5)
|
63 |
context_window = serializers.IntegerField(default=3)
|
64 |
+
chunk_overlap = serializers.IntegerField(default=1600)
|
65 |
num_k_rerank = serializers.IntegerField(default=20)
|
66 |
model_cohere_rerank = serializers.CharField(
|
67 |
required=False, default="rerank-english-v2.0"
|