luanpoppe committed
Commit e725020 · Parent: 3143cff

feat: separate the report-generation and summary-generation steps

_utils/resumo_completo_cursor.py CHANGED
@@ -317,15 +317,16 @@ class ContextualRetriever:
         return contextualized_chunks
 
 class EnhancedDocumentSummarizer(DocumentSummarizer):
-    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, system_prompt, gpt_model, gpt_temperature, id_modelo_do_usuario):
+    def __init__(self, openai_api_key: str, claude_api_key: str, config: RetrievalConfig, embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank, claude_context_model, prompt_relatorio, gpt_model, gpt_temperature, id_modelo_do_usuario, prompt_modelo):
         super().__init__(openai_api_key, os.environ.get("COHERE_API_KEY"), embedding_model, chunk_size, chunk_overlap, num_k_rerank, model_cohere_rerank)
         self.config = config
         self.contextual_retriever = ContextualRetriever(config, claude_api_key, claude_context_model)
         self.logger = logging.getLogger(__name__)
-        self.system_prompt = system_prompt
+        self.prompt_relatorio = prompt_relatorio
         self.gpt_model = gpt_model
         self.gpt_temperature = gpt_temperature
         self.id_modelo_do_usuario = id_modelo_do_usuario
+        self.prompt_modelo = prompt_modelo
 
     def create_enhanced_vector_store(self, chunks: List[ContextualizedChunk]) -> Tuple[Chroma, BM25Okapi, List[str]]:
         """Create vector store and BM25 index with contextualized chunks"""
@@ -453,8 +454,6 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
                     'relevance_score': score,
                     'context': metadata.get('context', '')
                 })
-
-        prompt_template = self.system_prompt
 
         url_request = f"{api_url}/modelo/{self.id_modelo_do_usuario}"
         resposta = requests.get(url_request)
@@ -464,11 +463,6 @@ class EnhancedDocumentSummarizer(DocumentSummarizer):
 
         modelo_buscado = resposta.json()["modelo"]
 
-        prompt = PromptTemplate(
-            template=prompt_template,
-            input_variables=["context", "modelo_usuario"]
-        )
-
         llm = ChatOpenAI(
             temperature=self.gpt_temperature,
             model_name=self.gpt_model,
@@ -476,10 +470,22 @@
 
         )
 
-        response = llm.predict(prompt.format(context="\n\n".join(contexts), modelo_usuario=modelo_buscado))
+        prompt_gerar_relatorio = PromptTemplate(
+            template=self.prompt_relatorio,
+            input_variables=["context"]
+        )
+
+        relatorio_gerado = llm.predict(prompt_gerar_relatorio.format(context="\n\n".join(contexts)))
+
+        prompt_gerar_modelo = PromptTemplate(
+            template=self.prompt_modelo,
+            input_variables=["context", "modelo_usuario"]
+        )
+
+        modelo_gerado = llm.predict(prompt_gerar_modelo.format(context=relatorio_gerado, modelo_usuario=modelo_buscado))
 
         # Split the response into paragraphs
-        summaries = [p.strip() for p in response.split('\n\n') if p.strip()]
+        summaries = [p.strip() for p in modelo_gerado.split('\n\n') if p.strip()]
 
         # Create structured output
         structured_output = []
@@ -525,10 +531,11 @@ def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs):
         num_k_rerank=serializer["num_k_rerank"],
         model_cohere_rerank=serializer["model_cohere_rerank"],
         claude_context_model=serializer["claude_context_model"],
-        system_prompt=serializer["system_prompt"],
+        prompt_relatorio=serializer["prompt_relatorio"],
         gpt_model=serializer["model"],
         gpt_temperature=serializer["gpt_temperature"],
-        id_modelo_do_usuario=serializer["id_modelo_do_usuario"]
+        id_modelo_do_usuario=serializer["id_modelo_do_usuario"],
+        prompt_modelo=serializer["prompt_modelo"]
     )
 
     # # Load and process document
@@ -582,5 +589,6 @@ def get_llm_summary_answer_by_cursor_complete(serializer, listaPDFs):
         "hf_embedding": serializer["hf_embedding"],
         "chunk_size": serializer["chunk_size"],
         "chunk_overlap": serializer["chunk_overlap"],
-        "system_prompt": serializer["system_prompt"],
+        "prompt_relatorio": serializer["prompt_relatorio"],
+        "prompt_modelo": serializer["prompt_modelo"]
     }}
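Read together, these hunks replace the single system_prompt call with a two-step chain: the LLM first writes an intermediate report (relatorio_gerado) from the retrieved chunks, and that report, rather than the raw chunks, is then filled into the user's template (modelo_buscado). Below is a minimal sketch of the new flow in isolation, assuming the same LangChain primitives the file already uses (PromptTemplate, ChatOpenAI, llm.predict); the helper name and its parameter list are hypothetical stand-ins for the class attributes above.

from typing import List

from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

def gerar_em_duas_etapas(contexts: List[str], modelo_buscado: str,
                         prompt_relatorio: str, prompt_modelo: str,
                         gpt_model: str, gpt_temperature: float) -> str:
    # Hypothetical helper; mirrors the two llm.predict calls added above.
    llm = ChatOpenAI(temperature=gpt_temperature, model_name=gpt_model)

    # Step 1: generate the report from the retrieved chunks only.
    prompt_gerar_relatorio = PromptTemplate(
        template=prompt_relatorio,
        input_variables=["context"]
    )
    relatorio_gerado = llm.predict(
        prompt_gerar_relatorio.format(context="\n\n".join(contexts))
    )

    # Step 2: the report becomes the {context} of the second prompt, so the
    # final text is grounded in the summary rather than the raw chunks.
    prompt_gerar_modelo = PromptTemplate(
        template=prompt_modelo,
        input_variables=["context", "modelo_usuario"]
    )
    return llm.predict(
        prompt_gerar_modelo.format(context=relatorio_gerado,
                                   modelo_usuario=modelo_buscado)
    )

One consequence of the split worth noting: the second call never sees the original chunks, so any detail the report step drops cannot reappear in the final output.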
resumos/serializer.py CHANGED
@@ -29,7 +29,20 @@ class ResumoCursorSerializer(serializers.Serializer):
     chunk_overlap = serializers.IntegerField(required=False, default=200)
 
 
-system_prompt = """
+system_prompt_relatorio = """
+Based on the following context, provide multiple key points from the document.
+For each point, create a new paragraph.
+Each paragraph should be a complete, self-contained insight.
+Include any relevant context provided.
+
+Context: {context}
+
+Key points:
+"""
+
+user_message = "What are the main points of this document?"
+
+system_prompt_modelo = """
 Based on the following context, provide multiple key points from the document.
 For each point, create a new paragraph.
 Each paragraph should be a complete, self-contained insight.
@@ -41,9 +54,10 @@ system_prompt = """
 
 Key points:
 """
-user_message = "What are the main points of this document?"
 class ResumoCursorCompeltoSerializer(ResumoCursorSerializer):
-    system_prompt = serializers.CharField(required=False, default=system_prompt)
+    system_prompt = None
+    prompt_relatorio = serializers.CharField(required=False, default=system_prompt_relatorio)
+    prompt_modelo = serializers.CharField(required=False, default=system_prompt_modelo)
     user_message = serializers.CharField(required=False, default=user_message)
     num_chunks_retrieval = serializers.IntegerField(default=5)
     embedding_weight = serializers.FloatField(default=0.5)
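On the serializer side, required=False together with default keeps existing clients working: a request that omits prompt_relatorio or prompt_modelo still comes out of validation with the module-level prompt strings, and system_prompt = None uses Django REST Framework's convention for removing a field declared on a parent serializer, presumably the one inherited from ResumoCursorSerializer. A minimal sketch of that fallback behavior, assuming DRF, with a hypothetical serializer name and shortened default strings:

from rest_framework import serializers

# Shortened stand-ins for the module-level defaults defined above.
system_prompt_relatorio = "Context: {context}\n\nKey points:"
system_prompt_modelo = "Context: {context}\n\nKey points:"

class DoisPromptsSerializer(serializers.Serializer):  # hypothetical name
    prompt_relatorio = serializers.CharField(required=False, default=system_prompt_relatorio)
    prompt_modelo = serializers.CharField(required=False, default=system_prompt_modelo)

payload = {}  # client sends neither prompt
s = DoisPromptsSerializer(data=payload)
s.is_valid(raise_exception=True)
assert s.validated_data["prompt_relatorio"] == system_prompt_relatorio
assert s.validated_data["prompt_modelo"] == system_prompt_modelo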