from langchain_core.prompts import PromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import numpy as np
from langchain_ollama import OllamaLLM
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from load_document import load_data
from split_document import split_docs
from embed_docs import embed_docs
from retrieve import retrieve
from datetime import datetime
# from js import js
# from theme import theme
import os
import glob
from fastapi import FastAPI, Query, Request
from pydantic import BaseModel
import uvicorn


app = FastAPI(title="Know The Law", description="A FastAPI application for legal assistance using AI.")



# Location of the persisted FAISS index; adjust vector_store_path for your environment.
vector_store_path = "/home/user/VectorStoreDB"
index_name = "faiss_index"
full_index_path = os.path.join(vector_store_path, index_name)

# Create the embedder with a specific model
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Speech-recognition pipeline (currently disabled)
# transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device="cpu")




def fetch_doc():
    # Adjust the path as needed, e.g., './' for current directory
    pdf_files = glob.glob("Document/*.pdf")

    # If you want to include subdirectories:
    # pdf_files = glob.glob("**/*.pdf", recursive=True)

    return pdf_files

# Define the LLM
hf_token = os.environ.get("HF_TOKEN", "").strip() or None  # Set your Hugging Face token in the HF_TOKEN environment variable
# Alternative backends (disabled):
# llm = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.3", device="cpu", use_auth_token=hf_token, token=hf_token)
# llm = OllamaLLM(model="mistral:7b-instruct", base_url="http://host.docker.internal:11434")
model_id = "google/gemma-2b-it"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", torch_dtype="auto", token=hf_token)

# Create the text-generation pipeline
hf_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)
llm = HuggingFacePipeline(pipeline=hf_pipe)

pdf_files = fetch_doc()  # Fetch the PDF dataset
chunks = None
loaded_docs = []
# Build the index only if it does not already exist; otherwise embed_docs is
# expected to load the saved index (chunks stays None).
if not os.path.exists(full_index_path):
    for doc in pdf_files:
        print(f"Loading.....{doc}")
        docs = load_data(doc)  # Load the document
        loaded_docs.append(docs)
    final_docs = [item for sublist in loaded_docs for item in sublist]  # Flatten the list of lists
    chunks = split_docs(final_docs, embedder=embedder)  # Split documents into chunks
saved_vector = embed_docs(chunks, embedder=embedder)  # Embed documents / load the vector store
retrieved = retrieve(saved_vector)  # Build the retriever for similar docs

# Define the prompt template
prompt = """
You are The Law Assistant, an AI trained to help Nigerians understand their legal rights and obligations. Using the provided context below, answer user questions related to Nigerian law.

Instructions:

1. Base your responses strictly on the given context or verified legal sources.

2. If the answer is not in the context and you're unsure, respond with: "I don't know based on the available information." Do not fabricate or speculate.

3. Keep your answers clear, concise, and jargon-free.

4. Always cite the legal source(s) or reference(s) you used (e.g., constitution section, legal act, court ruling).

Context: {context}
Question: {input}

Helpful Answer:"""


QA_CHAIN_PROMPT = PromptTemplate.from_template(template=prompt)

# Create document prompt
document_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Context:\ncontent:{page_content}\nsource:{source}",
)

# Create the stuff documents chain
combine_docs_chain = create_stuff_documents_chain(
    llm,
    QA_CHAIN_PROMPT,
    document_prompt=document_prompt
)
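
# The "stuff" chain formats every retrieved document with document_prompt and
# concatenates the results into the {context} slot of QA_CHAIN_PROMPT before
# sending a single prompt to the LLM.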

# Create the retrieval chain
qa_chain = create_retrieval_chain(
    retriever=retrieved,
    combine_docs_chain=combine_docs_chain
)
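
# qa_chain.invoke({"input": ...}) returns a dict containing the original "input",
# the retrieved "context" documents, and the generated "answer".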

class QueryRequest(BaseModel):
    question: str

@app.get("/")
def home():
    return {"message": "Welcome to the Know The Law API. Use POST /query to ask legal questions."}

@app.post("/query")
def respond(query: QueryRequest):
    # Invoke the chain with the question
    question = query.question
    result = qa_chain.invoke({"input": question})

    # Return the answer
    return {"answer": result["answer"]}
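

# Assumed local entrypoint: uvicorn is imported above but never started in the
# original file; the host and port here are placeholders, not values from the source.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the server is running (hypothetical URL and question):
#   curl -X POST http://localhost:8000/query \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What does the constitution say about freedom of expression?"}'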