Sadiksmart0 committed
Commit 74d8f71 · verified · Parent(s): 1d3516c

Upload 11 files

Files changed (12)
  1. .gitattributes +1 -0
  2. README.md +4 -4
  3. app.py +30 -9
  4. app_ui.py +107 -0
  5. constitution.pdf +3 -0
  6. embed_docs.py +32 -0
  7. js.py +45 -0
  8. load_document.py +12 -0
  9. requirements.txt +6 -5
  10. retrieve.py +7 -0
  11. split_document.py +13 -0
  12. theme.py +19 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+constitution.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: DeLaw Ollama
-emoji: 👀
-colorFrom: yellow
-colorTo: purple
+title: The Law
+emoji: 🦀
+colorFrom: purple
+colorTo: pink
 sdk: docker
 pinned: false
 ---
app.py CHANGED
@@ -1,16 +1,16 @@
 from langchain_core.prompts import PromptTemplate
 from langchain.chains import create_retrieval_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
-# import gradio as gr
-# from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import numpy as np
 from langchain_ollama import OllamaLLM
 from langchain_huggingface import HuggingFaceEmbeddings
-# from langchain_community.llms import HuggingFacePipeline
-# from load_document import load_data
-# from split_document import split_docs
-# from embed_docs import embed_docs
-# from retrieve import retrieve
+from langchain_community.llms import HuggingFacePipeline
+from load_document import load_data
+from split_document import split_docs
+from embed_docs import embed_docs
+from retrieve import retrieve
 from datetime import datetime
 # from js import js
 # from theme import theme
@@ -41,12 +41,33 @@ embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6
 def fetch_doc():
     # Adjust the path as needed, e.g., './' for current directory
     pdf_files = glob.glob("Document/*.pdf")
+
+    # If you want to include subdirectories:
+    # pdf_files = glob.glob("**/*.pdf", recursive=True)
+
     return pdf_files
 
 # Define llm
 hf_token = os.environ.get("HF_TOKEN").strip()  # Ensure to set your Hugging Face token in the environment variable HF_TOKEN
-llm = OllamaLLM(model="mistral:7b-instruct")
-
+# llm = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.3", device="cpu", use_auth_token=hf_token, token=hf_token)
+# llm = OllamaLLM(model="mistral:7b-instruct", base_url="http://host.docker.internal:11434")
+model_id = "google/gemma-2b-it"
+
+# Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", torch_dtype="auto", token=hf_token)
+
+# Create text generation pipeline
+hf_pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_new_tokens=512,
+    temperature=0.7,
+    top_p=0.9,
+    do_sample=True
+)
+llm = HuggingFacePipeline(pipeline=hf_pipe)
 
 pdf_files = fetch_doc()  # Fetch Dataset
 chunks = None
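
This hunk swaps the Ollama backend for an in-process HuggingFacePipeline; the retrieval chain that consumes `llm` sits below the hunk and is not shown. A rough sketch of how the newly imported helpers plug together downstream, assuming a chain built with the imported factories (the prompt text and variable names here are illustrative, not the committed ones):

# Illustrative wiring only; app.py's actual prompt and chain setup are outside this hunk.
prompt = PromptTemplate.from_template(
    "Answer using only the context below.\n\nContext:\n{context}\n\nQuestion: {input}"
)

docs = [d for f in pdf_files for d in load_data(f)]  # load every PDF in Document/
chunks = split_docs(docs, embedder)                  # semantic chunking
vector_store = embed_docs(chunks, embedder)          # FAISS index, cached on disk
retriever = retrieve(vector_store)                   # top-3 similarity retriever

combine = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, combine)
print(rag_chain.invoke({"input": "What does the constitution say about bail?"})["answer"])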
app_ui.py ADDED
@@ -0,0 +1,107 @@
+from langchain_core.prompts import PromptTemplate
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+import gradio as gr
+from transformers import pipeline
+import numpy as np
+from langchain_ollama import OllamaLLM
+from langchain_huggingface import HuggingFaceEmbeddings
+from load_document import load_data
+from split_document import split_docs
+from embed_docs import embed_docs
+from retrieve import retrieve
+from datetime import datetime
+from js import js
+from theme import theme
+import os
+import glob
+import requests
+
+# Initialize our speech pipeline
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device="cpu")
+pretext = ""
+
+def send_to_chat(question: str, history=None):
+    payload = {
+        "question": question
+    }
+    response = requests.post("https://sadiksmart0-the-law.hf.space/query", json=payload)
+    if response.status_code != 200:
+        print(f"Error {response.status_code}:")
+        return f"Error {response.status_code}: Unable to fetch response from server."
+    return response.json().get("answer", "No answer returned.")
+pretext = ""
+def transcribe(audio):
+    global pretext
+    if audio is None:
+        return "Please record again. Loud and clear audio is required."
+    sr, y = audio
+
+
+    # Convert to mono if stereo
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+
+    pretext = transcriber({"sampling_rate": sr, "raw": y})["text"]
+    return pretext
+
+with gr.Blocks(title="Know The Law", theme=theme, js=js) as demo:
+
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("# Know The Law")
+
+            audio_input = gr.Audio(
+                label="Input Audio",
+                sources=["microphone"],
+                type="numpy",
+                container=True,
+                interactive=True,
+                waveform_options=gr.WaveformOptions(waveform_color="#B83A4B"),
+            )
+
+            output_text = gr.Textbox(
+                interactive=True,
+                submit_btn=True,  # Enables submit event
+                label="Transcription Output",
+                visible=False  # Made it invisible
+            )
+
+            audio_input.change(transcribe, inputs=[audio_input], outputs=[output_text])
+
+            gr.Markdown("# What does the Law say?")
+
+            chat = gr.ChatInterface(
+                send_to_chat,
+                chatbot=gr.Chatbot(height=300, type="messages"),
+                textbox=gr.Textbox(
+                    placeholder="Ask me a question related to Nigerian law",
+                    container=True,
+                    scale=7,
+                    submit_btn=True,
+                    type="text"
+                ),
+                type="messages",
+                examples=[
+                    "How can I file a complaint against police misconduct?",
+                    "What is the process for obtaining a court order?",
+                    "What are the legal requirements for starting a business in Nigeria?"
+                ],
+                title="Law Assistant",
+                run_examples_on_click=True,
+                save_history=True,
+                cache_examples=True
+            )
+
+            chat_input = chat.textbox  # Get chatbot's text input
+
+            # Autofill chatbot input box with transcribed text
+            output_text.change(lambda x: x, inputs=[output_text], outputs=[chat_input])
+
+            # Submit event: when the user presses Enter on output_text, it submits to chat
+            output_text.submit(send_to_chat, inputs=[output_text, chat.chatbot], outputs=chat.chatbot)
+
+if __name__ == "__main__":
+    demo.launch(share=True, server_port=8001)
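
send_to_chat assumes the Space exposes a POST /query endpoint that takes {"question": ...} and returns {"answer": ...}. That route is presumably defined elsewhere in app.py, outside the hunks shown; a minimal FastAPI handler compatible with this contract might look like the sketch below, where only the payload shape is taken from the code above and the rest is assumption:

# Hypothetical server-side counterpart to send_to_chat.
from fastapi import FastAPI
from pydantic import BaseModel

api = FastAPI()

class Query(BaseModel):
    question: str

@api.post("/query")
def query(q: Query):
    result = rag_chain.invoke({"input": q.question})  # rag_chain as in the earlier sketch
    return {"answer": result["answer"]}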
constitution.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84ee8e8a0fec7592a6176795762c35ddc93d9a12bed5799d3968c06e69e6ddcb
+size 775326
embed_docs.py ADDED
@@ -0,0 +1,32 @@
+
+from langchain_community.vectorstores import FAISS
+import os
+from datetime import datetime
+
+vector_store_path = "/home/user/VectorStoreDB"
+index_name = "faiss_index"
+full_index_path = os.path.join(vector_store_path, index_name)
+start = ""
+end = ""
+
+
+def embed_docs(documents, embedder):
+
+    # Ensure the directory exists
+    os.makedirs(vector_store_path, exist_ok=True)
+
+    # Just query it if it already exists
+    if os.path.exists(full_index_path):
+        print(f"Loading existing vector store at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        saved_vector = FAISS.load_local(full_index_path,
+                                        embeddings=embedder,
+                                        allow_dangerous_deserialization=True)
+
+        return saved_vector
+    else:
+        print(f"Embedding documents at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        embedded_vector = FAISS.from_documents(documents=documents, embedding=embedder)
+        embedded_vector.save_local(full_index_path)
+        print(f"Vector store saved at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+        return embedded_vector
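
embed_docs is load-or-build: the first call embeds and saves the index, later calls deserialize it from disk (hence allow_dangerous_deserialization=True, which is only safe here because the index is one this app wrote itself). A quick smoke test of the caching behavior, assuming the packages from requirements.txt are installed and the store path is writable:

# Run this twice: the first call prints "Embedding documents...",
# the second prints "Loading existing vector store...".
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from embed_docs import embed_docs

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vs = embed_docs([Document(page_content="Section 35 guarantees personal liberty.")], emb)
print(vs.similarity_search("personal liberty", k=1)[0].page_content)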
js.py ADDED
@@ -0,0 +1,45 @@
+js = """
+async function main() {
+  const script1 = document.createElement("script");
+  script1.src = "https://cdn.jsdelivr.net/npm/[email protected]/dist/ort.js";
+  document.head.appendChild(script1);
+  const script2 = document.createElement("script");
+  script2.onload = async () => {
+    console.log("vad loaded");
+    var record = document.querySelector('.record-button');
+    record.textContent = "Just Start Talking!";
+    record.style = "width: fit-content; padding-right: 0.5vw;";
+    const myvad = await vad.MicVAD.new({
+      onSpeechStart: () => {
+        var record = document.querySelector('.record-button');
+        var player = document.querySelector('#streaming-out');
+        if (record != null && (player == null || player.paused)) {
+          console.log(record);
+          record.click();
+        }
+      },
+      onSpeechEnd: (audio) => {
+        var stop = document.querySelector('.stop-button');
+        if (stop != null) {
+          console.log(stop);
+          stop.click();
+        }
+      }
+    });
+    myvad.start();
+  };
+  script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/[email protected]/dist/bundle.min.js";
+  script1.onload = () => {
+    console.log("onnx loaded");
+    document.head.appendChild(script2);
+  };
+}
+"""
+
+js_reset = """
+() => {
+  var record = document.querySelector('.record-button');
+  record.textContent = "Just Start Talking!";
+  record.style = "width: fit-content; padding-right: 0.5vw;";
+}
+"""
load_document.py ADDED
@@ -0,0 +1,12 @@
+
+
+from langchain_community.document_loaders import PDFPlumberLoader
+from langchain_community.document_loaders import PyMuPDFLoader
+
+
+# Load the PDF
+def load_data(document):
+    loader = PDFPlumberLoader(document)
+    docs = loader.load()
+
+    return docs
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-fastapi
+fastapi[standard]==0.115.12
 gradio==5.31.0
 langchain==0.3.25
 langchain_community==0.3.24
@@ -8,16 +8,17 @@ langchain_huggingface==0.2.0
 langchain_ollama==0.3.3
 numpy==2.2.6
 Requests==2.32.3
-#selenium==4.32.0
-#transformers==4.46.2
+selenium==4.32.0
+transformers==4.46.2
 uvicorn==0.34.2
 torch
 torchvision
-#huggingface_hub[hf_xet]
+huggingface_hub[hf_xet]
 pdfplumber
 faiss-cpu
 numpy
 pydantic
 protobuf
 sentencepiece>=0.1.99
-accelerate>=0.26.0
+accelerate>=0.26.0
+pymupdf
retrieve.py ADDED
@@ -0,0 +1,7 @@
+def retrieve(saved_vector):
+    print("Retrieving similar documents...")
+    retriever = saved_vector.as_retriever(
+        search_type="similarity",
+        search_kwargs={"k": 3}
+    )
+    return retriever
split_document.py ADDED
@@ -0,0 +1,13 @@
+from langchain_experimental.text_splitter import SemanticChunker
+
+
+
+
+def split_docs(docs, embedder):
+
+    # Split into chunks using the SemanticChunker with the embedder
+    print("Splitting documents into chunks...")
+    text_splitter = SemanticChunker(embeddings=embedder)
+    documents = text_splitter.split_documents(docs)
+
+    return documents
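
Unlike fixed-size splitters, SemanticChunker places breakpoints where adjacent sentences' embeddings diverge, so chunk sizes vary with the text. A minimal standalone check, assuming the same embedder app.py builds (the sample text is illustrative):

# Prints each chunk's length and opening; boundaries fall at semantic
# shifts rather than at a fixed character count.
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from split_document import split_docs

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
doc = Document(page_content="Chapter one defines citizenship. Chapter four lists fundamental rights.")
for chunk in split_docs([doc], emb):
    print(len(chunk.page_content), chunk.page_content[:60])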
theme.py ADDED
@@ -0,0 +1,19 @@
+import gradio as gr
+
+theme = gr.themes.Soft(
+    primary_hue=gr.themes.Color(
+        c100="#82000019",
+        c200="#82000033",
+        c300="#8200004c",
+        c400="#82000066",
+        c50="#8200007f",
+        c500="#8200007f",
+        c600="#82000099",
+        c700="#820000b2",
+        c800="#820000cc",
+        c900="#820000e5",
+        c950="#820000f2",
+    ),
+    secondary_hue="rose",
+    neutral_hue="stone",
+)
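
The palette is eleven shade variants of the same deep maroon (#820000 at varying alpha); app_ui.py applies it via gr.Blocks(theme=theme). To preview it in isolation, a minimal sketch:

# Standalone preview of the theme (mirrors how app_ui.py applies it).
import gradio as gr
from theme import theme

with gr.Blocks(theme=theme) as demo:
    gr.Button("Primary action", variant="primary")

if __name__ == "__main__":
    demo.launch()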