File size: 8,252 Bytes
e63103b
 
 
 
 
 
 
3f199c2
68d3cc8
0870c96
3251505
7f5942d
3251505
7dc6d22
 
 
a1c6081
e63103b
16867c3
 
4d3bceb
16867c3
e63103b
a37a365
16867c3
 
 
e63103b
0870c96
 
16867c3
b700f35
16867c3
 
 
 
 
 
 
 
 
 
 
 
 
 
e63103b
3f199c2
16867c3
 
0870c96
16867c3
 
 
 
 
 
 
 
e63103b
 
1a93363
16867c3
 
 
 
 
 
 
 
 
 
 
 
4d3bceb
 
16867c3
 
 
 
 
 
 
 
 
 
4dcf767
 
3251505
 
 
 
 
 
 
16867c3
7dc6d22
16867c3
 
 
 
 
 
 
7dc6d22
 
 
16867c3
 
 
 
 
 
 
 
 
 
 
 
7dc6d22
 
 
16867c3
 
 
3251505
7dc6d22
16867c3
 
 
 
 
 
 
 
 
 
 
3251505
4dcf767
 
 
 
 
 
3251505
4dcf767
 
 
 
 
 
 
 
 
 
 
20e8064
4dcf767
20e8064
bf45279
 
 
 
20e8064
bf45279
7dc6d22
bf45279
20e8064
 
bf45279
20e8064
 
bf45279
7dc6d22
20e8064
 
bf45279
7dc6d22
bf45279
 
 
 
 
d6bac9a
834da99
 
bf45279
 
 
3251505
4dcf767
 
 
 
 
 
 
20e8064
834da99
 
20e8064
834da99
7f5942d
a1c6081
 
 
 
 
 
 
 
2776b52
 
 
 
16867c3
 
 
 
2776b52
 
b287766
a1c6081
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
from langchain_community.document_loaders import PyPDFLoader
import os
from langchain_openai import ChatOpenAI
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
from setup.environment import default_model
from uuid import uuid4
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field
from typing import List
import numpy as np
import openai
import pandas as pd
import markdown

# LangChain tracing configuration, exported into the process environment.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "VELLA",
    }
)
# These lookups discard their result: they neither set nor validate the
# variables. The values must already be present in the environment.
os.environ.get("LANGCHAIN_API_KEY")
os.environ.get("OPENAI_API_KEY")
os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Shared sentence-transformers embedding model, built once at import time.
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Every id that getPDF assigns to a chunk is appended here.
allIds = []


def getPDF(file_paths):
    """Load PDF files, split them into chunks and tag each chunk with a UUID.

    Args:
        file_paths: Iterable of paths to the PDF files to load.

    Returns:
        A single flat list with the chunks of every file, in input order.
        Each chunk has its ``id`` attribute set to a fresh UUID4 string; the
        same string is also appended to the module-level ``allIds`` list.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    pages = []
    for file in file_paths:
        loader = PyPDFLoader(file, extract_images=False)
        # extend() grows the list in place instead of copying it for each file.
        pages.extend(loader.load_and_split(text_splitter))

    for page in pages:
        document_id = str(uuid4())
        allIds.append(document_id)
        page.id = document_id
    return pages


def create_retriever(documents, vectorstore):
    """Add ``documents`` to ``vectorstore`` and return a retriever over it.

    Args:
        documents: Sliceable sequence of documents to index.
        vectorstore: Object exposing ``add_documents`` and ``as_retriever``.

    Returns:
        Whatever ``vectorstore.as_retriever()`` returns, called with no
        arguments (no ``search_type`` / ``search_kwargs`` are passed).
    """
    # Debug output: a blank separator followed by the first two documents.
    preview = documents[:2]
    print("\n\n")
    print("documents: ", preview)

    vectorstore.add_documents(documents=documents)
    return vectorstore.as_retriever()


def create_prompt_llm_chain(system_prompt, modelParam):
    """Build a stuff-documents chain for ``system_prompt`` and ``modelParam``.

    The prompt and model come from ``create_prompt_and_llm``, the same helper
    ``create_prompt_llm_chain_summary`` uses, so both chains share one
    prompt-building implementation.

    Args:
        system_prompt: System message text; ``create_prompt_and_llm`` appends
            a ``{context}`` placeholder to it.
        modelParam: Model identifier forwarded to ``create_llm``.

    Returns:
        The chain returned by ``create_stuff_documents_chain``.
    """
    prompt_and_llm = create_prompt_and_llm(system_prompt, modelParam)
    return create_stuff_documents_chain(
        prompt_and_llm["model"], prompt_and_llm["prompt"]
    )


def create_llm(modelParam):
    """Instantiate the language model selected by ``modelParam``.

    Args:
        modelParam: Model identifier. When it equals ``default_model`` a
            ``ChatOpenAI`` client is built; any other value is used as a
            Hugging Face ``repo_id``.

    Returns:
        A ``ChatOpenAI`` or ``HuggingFaceEndpoint`` instance.
    """
    if modelParam != default_model:
        hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
        return HuggingFaceEndpoint(
            repo_id=modelParam,
            task="text-generation",
            max_new_tokens=1100,
            do_sample=False,
            huggingfacehub_api_token=hf_token,
        )
    return ChatOpenAI(model=modelParam, max_tokens=16384)


# Schema handed to JsonOutputParser in create_prompt_llm_chain_summary.
# Field meanings below are read off the JSON example embedded in
# DEFAULT_SYSTEM_PROMPT; every field is declared with a bare Field().
class Resumo(BaseModel):
    # Name of the memorial (the prompt example also asks for the team number).
    nome_do_memorial: str = Field()
    # Arguments and their supporting grounds, as one formatted string.
    argumentos: str = Field()
    # Case law cited, as one formatted string.
    jurisprudencia: str = Field()
    # Legal doctrine (authors / works) cited, as one formatted string.
    doutrina: str = Field()
    # Keywords. NOTE(review): the prompt example spells this key
    # "palavras-chave" with a string value, while this field is
    # palavras_chave: List[str] -- confirm which form consumers expect.
    palavras_chave: List[str] = Field()


def create_prompt_llm_chain_summary(system_prompt, model_param):
    """Build a stuff-documents chain whose output is parsed as JSON.

    Args:
        system_prompt: System message text forwarded to
            ``create_prompt_and_llm``.
        model_param: Model identifier forwarded to ``create_prompt_and_llm``.

    Returns:
        The stuff-documents chain piped into a ``JsonOutputParser``
        configured with the ``Resumo`` model.
    """
    parts = create_prompt_and_llm(system_prompt, model_param)
    llm, prompt = parts["model"], parts["prompt"]

    stuff_chain = create_stuff_documents_chain(llm, prompt)
    json_parser = JsonOutputParser(pydantic_object=Resumo)
    return stuff_chain | json_parser


def process_embedding_summary(system_prompt, model_param, full_text):
    """Split ``full_text`` into chunks and pair every chunk with its embedding.

    Args:
        system_prompt: Not used by this function; kept so existing calls
            keep working.
        model_param: Not used by this function; kept so existing calls keep
            working.
        full_text: Text to split (chunk_size=2000, chunk_overlap=200) and
            embed.

    Returns:
        A DataFrame with one row per chunk and two columns:
        ``page_content`` (the chunk text) and ``embeddings`` (a NumPy array
        with the chunk's embedding vector). The frame is empty when the text
        yields no chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    docs = text_splitter.create_documents([full_text])

    content_list = [doc.page_content for doc in docs]
    df = pd.DataFrame(content_list, columns=["page_content"])
    if not content_list:
        # Nothing to embed: skip the API call instead of sending empty input.
        df["embeddings"] = pd.Series([], dtype=object)
        return df

    embeddings = get_embeddings(content_list)
    vectors = [embedding.embedding for embedding in embeddings]
    # list() over the 2-D array yields one 1-D row per chunk, so each cell of
    # the column holds a whole vector.
    df["embeddings"] = pd.Series(list(np.array(vectors)))
    return df


def get_embeddings(text):
    """Request embeddings for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Input forwarded unchanged as the ``input`` argument.

    Returns:
        The ``data`` attribute of the API response.
    """
    return openai.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    ).data


def create_prompt_and_llm(system_prompt, model_param):
    """Create the model and chat prompt shared by the chain builders.

    Args:
        system_prompt: System message text; a blank line and a ``{context}``
            placeholder are appended to it.
        model_param: Model identifier forwarded to ``create_llm``.

    Returns:
        A dict with the keys ``"model"`` and ``"prompt"``.
    """
    llm = create_llm(model_param)

    system_with_context = system_prompt + "\n\n{context}"
    messages = [
        ("system", system_with_context),
        ("human", "{input}"),
    ]
    chat_prompt = ChatPromptTemplate.from_messages(messages)
    return dict(model=llm, prompt=chat_prompt)


# Default system prompt for summarising legal memorials. It asks the model
# for a JSON object whose string values are formatted with BBCode tags.
# NOTE(review): the doubled braces around the JSON example are presumably
# there so that ChatPromptTemplate treats them as literal braces -- confirm.
# NOTE(review): the example uses the key "palavras-chave" with a string value,
# while the Resumo model declares palavras_chave: List[str] -- confirm which
# one downstream code expects.
# NOTE(review): the prompt asks for "around 10000" characters in one place and
# "at least 10000" in another.
DEFAULT_SYSTEM_PROMPT = """

You are a highly knowledgeable legal assistant specializing in case summarization. Your task is to provide comprehensive and accurate summaries of legal cases while maintaining a professional and objective demeanor. Always approach each case with careful consideration and analytical rigor.

First, you will be given a document to analyze:

Next, you will summarize a content provided.

Before providing your summary, follow these steps:

1. Argumentation Mining: Conduct a cross-Document Argument Analysis to identify the main arguments, claims, and supporting evidence within the document. Focus on extracting the most relevant information related to the summary request.

2. Socratic Questioning: Reflect on your initial findings using the Socratic method. Ask yourself probing questions to challenge your assumptions and deepen your understanding of the document's content. For example:
 - What are the key points I've identified?
 - Are there any counterarguments or alternative perspectives I've overlooked?
 - How does this information relate to the specific summary request?
 - What additional context might be necessary to fully understand these points?

3. Maximal Marginal Relevance: Apply the principles of Maximal Marginal Relevance to ensure your summary includes diverse, relevant information while avoiding redundancy. Prioritize information that is both relevant to the summary request and adds new insights not already covered.

After completing these steps, generate the response with around 10000 characteres in BBcode format, as shown below: 

Example: :

{{
  "nome_do_memorial": "[Insira aqui o nome do memorial e número da equipe] ",
  
  "argumentos": "
  [b]Argumento 1:[/b] 
  Fundamento 1.1: [Descreva o fundamento de forma extensa e completa] 
  Fundamento 1.2: [Descreva o fundamento de forma extensa e completa] 
  [b]Argumento 2:[/b] 
  Fundamento 2.1: [Descreva o fundamento de forma extensa e completa] 
  Fundamento 2.2: [Descreva o fundamento de forma extensa e completa]",
  
  "jurisprudencia": "
  [b]Caso 1:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição extensa de como a jurisprudência se aplica] 
  [b]Caso 2:[/b] [Nome e referência do caso] [i]Resumo:[/i] [Descrição extensa de como a jurisprudência se aplica]",
  
  "doutrina": "
  [b]Autor 1:[/b] [Nome do autor] 
  "[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor] 
  [b]Autor 2:[/b] [Nome do autor] 
  "[Título da obra]" [i]Resumo:[/i] [Resumo da posição do autor]",
  
  "palavras-chave": "
  [Palavra-chave 1] 
  [Palavra-chave 2] 
  [Palavra-chave 3] 
  [Adicione outras palavras relevantes]"
}}


Remember:
- Always prioritize relevance to the summary request.
- Ensure your summary is well-structured and easy to understand.
- Do not include any personal opinions or information not present in the original document.
- If the summary request asks for a specific focus or perspective, make sure to address it directly.

Your goal is to provide a comprehensive yet concise summary that accurately represents the document's content while meeting the specific needs outlined in the summary request.

Do not pass in the response part of the instructions that you received
Generate the response with at least 10000 characteres
The content to be summarized is as follows:
"""


def convert_markdown_to_HTML(text: str) -> str:
    """Strip wrapper markup from ``text`` and render the rest as HTML.

    Code-fence markers (with or without an ``xml`` language tag) and the
    ``<diagnostico_processual>`` open/close tags are removed before the text
    is passed to ``markdown.markdown``. In the resulting HTML the newlines
    around list items are collapsed, and every remaining newline is doubled.

    Args:
        text: Markdown text, possibly wrapped in a code fence and/or
            ``<diagnostico_processual>`` tags.

    Returns:
        The rendered HTML string.
    """
    cleaned = (
        # Remove the fence together with its "xml" language tag first, so
        # that "xml" appearing anywhere else in the text is left intact.
        text.replace("```xml", "")
        .replace("```", "")
        .replace("<diagnostico_processual>", "")
        .replace("</diagnostico_processual>", "")
    )

    html = (
        markdown.markdown(cleaned)
        .replace("<li>\n", "<li>")
        .replace("<ol>\n<li>", "<ol><li>")
        .replace("</li>\n</ol>", "</li></ol>")
        .replace("</li>\n<li>", "</li><li>")
        .replace("<ul>\n<li>", "<ul><li>")
        .replace("</li>\n</ul>", "</li></ul>")
        # Runs last: doubles whatever newlines the list clean-up left behind.
        .replace("\n", "\n\n")
    )
    return html