Upload 11 files

- .gitattributes +1 -0
- README.md +4 -4
- app.py +30 -9
- app_ui.py +107 -0
- constitution.pdf +3 -0
- embed_docs.py +32 -0
- js.py +45 -0
- load_document.py +12 -0
- requirements.txt +6 -5
- retrieve.py +7 -0
- split_document.py +13 -0
- theme.py +19 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+constitution.pdf filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: The Law
+emoji: 🦀
+colorFrom: purple
+colorTo: pink
 sdk: docker
 pinned: false
 ---
app.py
CHANGED
@@ -1,16 +1,16 @@
 from langchain_core.prompts import PromptTemplate
 from langchain.chains import create_retrieval_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
-
-
+import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import numpy as np
 from langchain_ollama import OllamaLLM
 from langchain_huggingface import HuggingFaceEmbeddings
-
-
-
-
-
+from langchain_community.llms import HuggingFacePipeline
+from load_document import load_data
+from split_document import split_docs
+from embed_docs import embed_docs
+from retrieve import retrieve
 from datetime import datetime
 # from js import js
 # from theme import theme
@@ -41,12 +41,33 @@ embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6
 def fetch_doc():
     # Adjust the path as needed, e.g., './' for current directory
     pdf_files = glob.glob("Document/*.pdf")
+
+    # If you want to include subdirectories:
+    # pdf_files = glob.glob("**/*.pdf", recursive=True)
+
     return pdf_files
 
 # # Define llm
 hf_token = os.environ.get("HF_TOKEN").strip()  # Ensure to set your Hugging Face token in the environment variable HF_TOKEN
-llm =
-
+# #llm = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.3", device="cpu", use_auth_token=hf_token, token=hf_token)
+# #llm = OllamaLLM(model="mistral:7b-instruct", base_url="http://host.docker.internal:11434")
+model_id = "google/gemma-2b-it"
+
+# # Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", torch_dtype="auto", token=hf_token)
+
+# # Create text generation pipeline
+hf_pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_new_tokens=512,
+    temperature=0.7,
+    top_p=0.9,
+    do_sample=True
+)
+llm = HuggingFacePipeline(pipeline=hf_pipe)
 
 pdf_files = fetch_doc()  # Fetch Dataset
 chunks = None
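Note: the hunks above only show the new imports and the Gemma pipeline setup; the chain construction itself falls outside the diff. A minimal sketch of how the imported helpers could be wired together downstream, assuming the prompt wording and variable names below are illustrative rather than taken from app.py:

# Sketch only: assumes llm, embedder and pdf_files as defined above; the prompt text is hypothetical.
from langchain_core.prompts import PromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

docs = load_data(pdf_files[0])                # PDFPlumberLoader under the hood
chunks = split_docs(docs, embedder)           # semantic chunking
vector_store = embed_docs(chunks, embedder)   # build or load the FAISS index
retriever = retrieve(vector_store)            # similarity search, k=3

prompt = PromptTemplate.from_template(
    "Answer using only the context below.\n\nContext:\n{context}\n\nQuestion: {input}\n\nAnswer:"
)
combine_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, combine_chain)

result = rag_chain.invoke({"input": "How can I file a complaint against police misconduct?"})
print(result["answer"])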
app_ui.py
ADDED
@@ -0,0 +1,107 @@
+from langchain_core.prompts import PromptTemplate
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+import gradio as gr
+from transformers import pipeline
+import numpy as np
+from langchain_ollama import OllamaLLM
+from langchain_huggingface import HuggingFaceEmbeddings
+from load_document import load_data
+from split_document import split_docs
+from embed_docs import embed_docs
+from retrieve import retrieve
+from datetime import datetime
+from js import js
+from theme import theme
+import os
+import glob
+import requests
+
+# Initialize our speech pipeline
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device="cpu")
+pretext = ""
+
+def send_to_chat(question: str, history=None):
+    payload = {
+        "question": question
+    }
+    response = requests.post("https://sadiksmart0-the-law.hf.space/query", json=payload)
+    if response.status_code != 200:
+        print(f"Error {response.status_code}:")
+        return f"Error {response.status_code}: Unable to fetch response from server."
+    return response.json().get("answer", "No answer returned.")
+pretext = ""
+def transcribe(audio):
+    global pretext
+    if audio is None:
+        return "Please record again. Loud and clear audio is required."
+    sr, y = audio
+
+
+    # Convert to mono if stereo
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+
+    pretext = transcriber({"sampling_rate": sr, "raw": y})["text"]
+    return pretext
+
+with gr.Blocks(title="Know The Law", theme=theme, js=js) as demo:
+
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("# Know The Law")
+
+            audio_input = gr.Audio(
+                label="Input Audio",
+                sources=["microphone"],
+                type="numpy",
+                container=True,
+                interactive=True,
+                waveform_options=gr.WaveformOptions(waveform_color="#B83A4B"),
+            )
+
+            output_text = gr.Textbox(
+                interactive=True,
+                submit_btn=True,  # Enables submit event
+                label="Transcription Output",
+                visible=False  # Made it invisible
+            )
+
+            audio_input.change(transcribe, inputs=[audio_input], outputs=[output_text])
+
+            gr.Markdown("# What does the Law say?")
+
+            chat = gr.ChatInterface(
+                send_to_chat,
+                chatbot=gr.Chatbot(height=300, type="messages"),
+                textbox=gr.Textbox(
+                    placeholder="Ask me a question related to Nigerian law",
+                    container=True,
+                    scale=7,
+                    submit_btn=True,
+                    type="text"
+                ),
+                type="messages",
+                examples=[
+                    "How can I file a complaint against police misconduct?",
+                    "What is the process for obtaining a court order?",
+                    "What are the legal requirements for starting a business in Nigeria?"
+                ],
+                title="Law Assistant",
+                run_examples_on_click=True,
+                save_history=True,
+                cache_examples=True
+            )
+
+            chat_input = chat.textbox  # Get chatbot's text input
+
+            # Autofill chatbot input box with transcribed text
+            output_text.change(lambda x: x, inputs=[output_text], outputs=[chat_input])
+
+            # Submit event: When user presses Enter on output_text, it submits to chat
+            output_text.submit(send_to_chat, inputs=[output_text, chat.chatbot], outputs=chat.chatbot)
+
+if __name__ == "__main__":
+    demo.launch(share=True, server_port=8001)
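Note: send_to_chat posts {"question": ...} to the Space's /query endpoint and reads an "answer" field from the JSON response; that FastAPI backend is not visible in the hunks shown here. A compatible endpoint, sketched under the assumption that app.py builds a rag_chain as in the earlier sketch:

# Hypothetical server-side counterpart of send_to_chat; names are assumptions, not code from this commit.
from fastapi import FastAPI
from pydantic import BaseModel

from app import rag_chain  # assumed: app.py builds and exposes the retrieval chain

app = FastAPI()

class Query(BaseModel):
    question: str

@app.post("/query")
def query(payload: Query):
    # Run the retrieval chain and return only the generated answer.
    result = rag_chain.invoke({"input": payload.question})
    return {"answer": result["answer"]}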
constitution.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84ee8e8a0fec7592a6176795762c35ddc93d9a12bed5799d3968c06e69e6ddcb
+size 775326
embed_docs.py
ADDED
@@ -0,0 +1,32 @@
+
+from langchain_community.vectorstores import FAISS
+import os
+from datetime import datetime
+
+vector_store_path = "/home/user/VectorStoreDB"
+index_name = "faiss_index"
+full_index_path = os.path.join(vector_store_path, index_name)
+start = ""
+end = ""
+
+
+def embed_docs(documents, embedder):
+
+    # Ensure the directory exists
+    os.makedirs(vector_store_path, exist_ok=True)
+
+    # just query if it exists
+    if os.path.exists(full_index_path):
+        print(f"Loading existing vector store at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        saved_vector = FAISS.load_local(full_index_path,
+                                        embeddings=embedder,
+                                        allow_dangerous_deserialization=True)
+
+        return saved_vector
+    else:
+        print(f"Embedding documents at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        embedded_vector = FAISS.from_documents(documents=documents, embedding=embedder)
+        embedded_vector.save_local(full_index_path)
+        print(f"Vector store saved at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+        return embedded_vector
js.py
ADDED
@@ -0,0 +1,45 @@
+js = """
+async function main() {
+  const script1 = document.createElement("script");
+  script1.src = "https://cdn.jsdelivr.net/npm/[email protected]/dist/ort.js";
+  document.head.appendChild(script1)
+  const script2 = document.createElement("script");
+  script2.onload = async () => {
+    console.log("vad loaded");
+    var record = document.querySelector('.record-button');
+    record.textContent = "Just Start Talking!"
+    record.style = "width: fit-content; padding-right: 0.5vw;"
+    const myvad = await vad.MicVAD.new({
+      onSpeechStart: () => {
+        var record = document.querySelector('.record-button');
+        var player = document.querySelector('#streaming-out')
+        if (record != null && (player == null || player.paused)) {
+          console.log(record);
+          record.click();
+        }
+      },
+      onSpeechEnd: (audio) => {
+        var stop = document.querySelector('.stop-button');
+        if (stop != null) {
+          console.log(stop);
+          stop.click();
+        }
+      }
+    })
+    myvad.start()
+  }
+  script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/[email protected]/dist/bundle.min.js";
+  script1.onload = () => {
+    console.log("onnx loaded")
+    document.head.appendChild(script2)
+  };
+}
+"""
+
+js_reset = """
+() => {
+  var record = document.querySelector('.record-button');
+  record.textContent = "Just Start Talking!"
+  record.style = "width: fit-content; padding-right: 0.5vw;"
+}
+"""
load_document.py
ADDED
@@ -0,0 +1,12 @@
+
+
+from langchain_community.document_loaders import PDFPlumberLoader
+from langchain_community.document_loaders import PyMuPDFLoader
+
+
+# Load the PDF
+def load_data(document):
+    loader = PDFPlumberLoader(document)
+    docs = loader.load()
+
+    return docs
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-fastapi
+fastapi[standard]==0.115.12
 gradio==5.31.0
 langchain==0.3.25
 langchain_community==0.3.24
@@ -8,16 +8,17 @@ langchain_huggingface==0.2.0
 langchain_ollama==0.3.3
 numpy==2.2.6
 Requests==2.32.3
-
-
+selenium==4.32.0
+transformers==4.46.2
 uvicorn==0.34.2
 torch
 torchvision
-
+huggingface_hub[hf_xet]
 pdfplumber
 faiss-cpu
 numpy
 pydantic
 protobuf
 sentencepiece>=0.1.99
-accelerate>=0.26.0
+accelerate>=0.26.0
+pymupdf
retrieve.py
ADDED
@@ -0,0 +1,7 @@
+def retrieve(saved_vector):
+    print("Retrieving similar documents...")
+    retriever = saved_vector.as_retriever(
+        search_type="similarity",
+        search_kwargs={"k": 3}
+    )
+    return retriever
split_document.py
ADDED
@@ -0,0 +1,13 @@
+from langchain_experimental.text_splitter import SemanticChunker
+
+
+
+
+def split_docs(docs, embedder):
+
+    # Split into chunks using the SemanticChunker with the embedder
+    print("Splitting documents into chunks...")
+    text_splitter = SemanticChunker(embeddings=embedder)
+    documents = text_splitter.split_documents(docs)
+
+    return documents
theme.py
ADDED
@@ -0,0 +1,19 @@
+import gradio as gr
+
+theme = gr.themes.Soft(
+    primary_hue=gr.themes.Color(
+        c100="#82000019",
+        c200="#82000033",
+        c300="#8200004c",
+        c400="#82000066",
+        c50="#8200007f",
+        c500="#8200007f",
+        c600="#82000099",
+        c700="#820000b2",
+        c800="#820000cc",
+        c900="#820000e5",
+        c950="#820000f2",
+    ),
+    secondary_hue="rose",
+    neutral_hue="stone",
+)