luanpoppe committed on
Commit
234f840
·
1 Parent(s): 55f46c1

feat: adicionando melhorias e correções do contextual retriever

Browse files
_antigos/resumos/serializer.py CHANGED
@@ -25,5 +25,5 @@ class ResumoCursorSerializer(serializers.Serializer):
25
  user_message = serializers.CharField(required=False, default="")
26
  model = serializers.CharField(required=False, default=default_model)
27
  hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
28
- chunk_size = serializers.IntegerField(required=False, default=1000)
29
- chunk_overlap = serializers.IntegerField(required=False, default=200)
 
25
  user_message = serializers.CharField(required=False, default="")
26
  model = serializers.CharField(required=False, default=default_model)
27
  hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
28
+ chunk_size = serializers.IntegerField(required=False, default=5000)
29
+ chunk_overlap = serializers.IntegerField(required=False, default=1600)
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED
@@ -174,7 +174,8 @@ class ContextualRetriever:
174
  ContextualizedChunk(
175
  content=chunk.content,
176
  page_number=chunk.page_number,
177
- chunk_id=result[index][0],
 
178
  start_char=chunk.start_char,
179
  end_char=chunk.end_char,
180
  context=" ".join(result[index][1:2]),
 
174
  ContextualizedChunk(
175
  content=chunk.content,
176
  page_number=chunk.page_number,
177
+ id_do_processo=result[index][0],
178
+ chunk_id=chunk.chunk_id,
179
  start_char=chunk.start_char,
180
  end_char=chunk.end_char,
181
  context=" ".join(result[index][1:2]),
_utils/models/gerar_relatorio.py CHANGED
@@ -10,6 +10,7 @@ class DocumentChunk:
10
  chunk_id: str
11
  start_char: int
12
  end_char: int
 
13
 
14
 
15
  @dataclass
 
10
  chunk_id: str
11
  start_char: int
12
  end_char: int
13
+ id_do_processo: int = 0
14
 
15
 
16
  @dataclass
_utils/vector_stores/Vector_store_class.py CHANGED
@@ -21,7 +21,7 @@ class VectorStore:
21
  # Prepare texts with context
22
  if is_contextualized_chunk:
23
  texts = [
24
- f"Document_id: {chunk.chunk_id}\nDocument_context: {chunk.context}\nDocument_content: {chunk.content}"
25
  for chunk in chunks
26
  ]
27
  else:
@@ -35,6 +35,7 @@ class VectorStore:
35
  metadatas.append(
36
  {
37
  "chunk_id": chunk.chunk_id,
 
38
  "page": chunk.page_number,
39
  "start_char": chunk.start_char,
40
  "end_char": chunk.end_char,
@@ -46,6 +47,7 @@ class VectorStore:
46
  metadatas.append(
47
  {
48
  "chunk_id": chunk.chunk_id,
 
49
  "page": chunk.page_number,
50
  "start_char": chunk.start_char,
51
  "end_char": chunk.end_char,
 
21
  # Prepare texts with context
22
  if is_contextualized_chunk:
23
  texts = [
24
+ f"Document_id: {chunk.id_do_processo}\nDocument_context: {chunk.context}\nDocument_content: {chunk.content}"
25
  for chunk in chunks
26
  ]
27
  else:
 
35
  metadatas.append(
36
  {
37
  "chunk_id": chunk.chunk_id,
38
+ "id_do_processo": chunk.id_do_processo,
39
  "page": chunk.page_number,
40
  "start_char": chunk.start_char,
41
  "end_char": chunk.end_char,
 
47
  metadatas.append(
48
  {
49
  "chunk_id": chunk.chunk_id,
50
+ "id_do_processo": chunk.id_do_processo,
51
  "page": chunk.page_number,
52
  "start_char": chunk.start_char,
53
  "end_char": chunk.end_char,
gerar_documento/serializer.py CHANGED
@@ -33,7 +33,7 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
33
  embedding_weight = serializers.FloatField(default=0.5)
34
  bm25_weight = serializers.FloatField(default=0.5)
35
  context_window = serializers.IntegerField(default=3)
36
- chunk_overlap = serializers.IntegerField(default=200)
37
  num_k_rerank = serializers.IntegerField(default=20)
38
  model_cohere_rerank = serializers.CharField(
39
  required=False, default="rerank-english-v2.0"
@@ -61,7 +61,7 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
61
  embedding_weight = serializers.FloatField(default=0.5)
62
  bm25_weight = serializers.FloatField(default=0.5)
63
  context_window = serializers.IntegerField(default=3)
64
- chunk_overlap = serializers.IntegerField(default=200)
65
  num_k_rerank = serializers.IntegerField(default=20)
66
  model_cohere_rerank = serializers.CharField(
67
  required=False, default="rerank-english-v2.0"
 
33
  embedding_weight = serializers.FloatField(default=0.5)
34
  bm25_weight = serializers.FloatField(default=0.5)
35
  context_window = serializers.IntegerField(default=3)
36
+ chunk_overlap = serializers.IntegerField(default=1600)
37
  num_k_rerank = serializers.IntegerField(default=20)
38
  model_cohere_rerank = serializers.CharField(
39
  required=False, default="rerank-english-v2.0"
 
61
  embedding_weight = serializers.FloatField(default=0.5)
62
  bm25_weight = serializers.FloatField(default=0.5)
63
  context_window = serializers.IntegerField(default=3)
64
+ chunk_overlap = serializers.IntegerField(default=1600)
65
  num_k_rerank = serializers.IntegerField(default=20)
66
  model_cohere_rerank = serializers.CharField(
67
  required=False, default="rerank-english-v2.0"