Sadiksmart0 committed
Commit 74d8f71 · verified · Parent(s): 1d3516c

Upload 11 files

Files changed (12)
  1. .gitattributes +1 -0
  2. README.md +4 -4
  3. app.py +30 -9
  4. app_ui.py +107 -0
  5. constitution.pdf +3 -0
  6. embed_docs.py +32 -0
  7. js.py +45 -0
  8. load_document.py +12 -0
  9. requirements.txt +6 -5
  10. retrieve.py +7 -0
  11. split_document.py +13 -0
  12. theme.py +19 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+constitution.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: DeLaw Ollama
-emoji: 👀
-colorFrom: yellow
-colorTo: purple
+title: The Law
+emoji: 🦀
+colorFrom: purple
+colorTo: pink
 sdk: docker
 pinned: false
 ---
app.py CHANGED
@@ -1,16 +1,16 @@
 from langchain_core.prompts import PromptTemplate
 from langchain.chains import create_retrieval_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
-# import gradio as gr
-# from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+import gradio as gr
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import numpy as np
 from langchain_ollama import OllamaLLM
 from langchain_huggingface import HuggingFaceEmbeddings
-# from langchain_community.llms import HuggingFacePipeline
-# from load_document import load_data
-# from split_document import split_docs
-# from embed_docs import embed_docs
-# from retrieve import retrieve
+from langchain_community.llms import HuggingFacePipeline
+from load_document import load_data
+from split_document import split_docs
+from embed_docs import embed_docs
+from retrieve import retrieve
 from datetime import datetime
 # from js import js
 # from theme import theme
@@ -41,12 +41,33 @@ embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6
 def fetch_doc():
     # Adjust the path as needed, e.g., './' for current directory
     pdf_files = glob.glob("Document/*.pdf")
+
+    # If you want to include subdirectories:
+    # pdf_files = glob.glob("**/*.pdf", recursive=True)
+
     return pdf_files
 
 # Define llm
 hf_token = os.environ.get("HF_TOKEN").strip()  # Ensure to set your Hugging Face token in the environment variable HF_TOKEN
-llm = OllamaLLM(model="mistral:7b-instruct")
-
+# llm = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.3", device="cpu", use_auth_token=hf_token, token=hf_token)
+# llm = OllamaLLM(model="mistral:7b-instruct", base_url="http://host.docker.internal:11434")
+model_id = "google/gemma-2b-it"
+
+# Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", torch_dtype="auto", token=hf_token)
+
+# Create text generation pipeline
+hf_pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_new_tokens=512,
+    temperature=0.7,
+    top_p=0.9,
+    do_sample=True
+)
+llm = HuggingFacePipeline(pipeline=hf_pipe)
 
 pdf_files = fetch_doc()  # Fetch Dataset
 chunks = None
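
This hunk swaps the Ollama backend for an in-process HuggingFacePipeline; the retrieval chain that consumes `llm` sits below the hunk and is not shown. A rough sketch of how the newly imported helpers plug together downstream, assuming a chain built with the imported factories (the prompt text and variable names here are illustrative, not the committed ones):

# Illustrative wiring only; app.py's actual prompt and chain setup are outside this hunk.
prompt = PromptTemplate.from_template(
    "Answer using only the context below.\n\nContext:\n{context}\n\nQuestion: {input}"
)

docs = [d for f in pdf_files for d in load_data(f)]  # load every PDF in Document/
chunks = split_docs(docs, embedder)                  # semantic chunking
vector_store = embed_docs(chunks, embedder)          # FAISS index, cached on disk
retriever = retrieve(vector_store)                   # top-3 similarity retriever

combine = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, combine)
print(rag_chain.invoke({"input": "What does the constitution say about bail?"})["answer"])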
app_ui.py ADDED
@@ -0,0 +1,107 @@
+from langchain_core.prompts import PromptTemplate
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+import gradio as gr
+from transformers import pipeline
+import numpy as np
+from langchain_ollama import OllamaLLM
+from langchain_huggingface import HuggingFaceEmbeddings
+from load_document import load_data
+from split_document import split_docs
+from embed_docs import embed_docs
+from retrieve import retrieve
+from datetime import datetime
+from js import js
+from theme import theme
+import os
+import glob
+import requests
+
+# Initialize our speech pipeline
+transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device="cpu")
+pretext = ""
+
+def send_to_chat(question: str, history=None):
+    payload = {
+        "question": question
+    }
+    response = requests.post("https://sadiksmart0-the-law.hf.space/query", json=payload)
+    if response.status_code != 200:
+        print(f"Error {response.status_code}:")
+        return f"Error {response.status_code}: Unable to fetch response from server."
+    return response.json().get("answer", "No answer returned.")
+pretext = ""
+def transcribe(audio):
+    global pretext
+    if audio is None:
+        return "Please record again. Loud and clear audio is required."
+    sr, y = audio
+
+
+    # Convert to mono if stereo
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+
+    pretext = transcriber({"sampling_rate": sr, "raw": y})["text"]
+    return pretext
+
+with gr.Blocks(title="Know The Law", theme=theme, js=js) as demo:
+
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("# Know The Law")
+
+            audio_input = gr.Audio(
+                label="Input Audio",
+                sources=["microphone"],
+                type="numpy",
+                container=True,
+                interactive=True,
+                waveform_options=gr.WaveformOptions(waveform_color="#B83A4B"),
+            )
+
+            output_text = gr.Textbox(
+                interactive=True,
+                submit_btn=True,  # Enables submit event
+                label="Transcription Output",
+                visible=False  # Made it invisible
+            )
+
+            audio_input.change(transcribe, inputs=[audio_input], outputs=[output_text])
+
+            gr.Markdown("# What does the Law say?")
+
+            chat = gr.ChatInterface(
+                send_to_chat,
+                chatbot=gr.Chatbot(height=300, type="messages"),
+                textbox=gr.Textbox(
+                    placeholder="Ask me a question related to Nigerian law",
+                    container=True,
+                    scale=7,
+                    submit_btn=True,
+                    type="text"
+                ),
+                type="messages",
+                examples=[
+                    "How can I file a complaint against police misconduct?",
+                    "What is the process for obtaining a court order?",
+                    "What are the legal requirements for starting a business in Nigeria?"
+                ],
+                title="Law Assistant",
+                run_examples_on_click=True,
+                save_history=True,
+                cache_examples=True
+            )
+
+            chat_input = chat.textbox  # Get chatbot's text input
+
+            # Autofill chatbot input box with transcribed text
+            output_text.change(lambda x: x, inputs=[output_text], outputs=[chat_input])
+
+            # Submit event: when the user presses Enter on output_text, it submits to chat
+            output_text.submit(send_to_chat, inputs=[output_text, chat.chatbot], outputs=chat.chatbot)
+
+if __name__ == "__main__":
+    demo.launch(share=True, server_port=8001)
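
send_to_chat assumes the Space exposes a POST /query endpoint that takes {"question": ...} and returns {"answer": ...}. That route is presumably defined elsewhere in app.py, outside the hunks shown; a minimal FastAPI handler compatible with this contract might look like the sketch below, where only the payload shape is taken from the code above and the rest is assumption:

# Hypothetical server-side counterpart to send_to_chat.
from fastapi import FastAPI
from pydantic import BaseModel

api = FastAPI()

class Query(BaseModel):
    question: str

@api.post("/query")
def query(q: Query):
    result = rag_chain.invoke({"input": q.question})  # rag_chain as in the earlier sketch
    return {"answer": result["answer"]}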
constitution.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84ee8e8a0fec7592a6176795762c35ddc93d9a12bed5799d3968c06e69e6ddcb
+size 775326
embed_docs.py ADDED
@@ -0,0 +1,32 @@
+
+from langchain_community.vectorstores import FAISS
+import os
+from datetime import datetime
+
+vector_store_path = "/home/user/VectorStoreDB"
+index_name = "faiss_index"
+full_index_path = os.path.join(vector_store_path, index_name)
+start = ""
+end = ""
+
+
+def embed_docs(documents, embedder):
+
+    # Ensure the directory exists
+    os.makedirs(vector_store_path, exist_ok=True)
+
+    # Just query it if it already exists
+    if os.path.exists(full_index_path):
+        print(f"Loading existing vector store at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        saved_vector = FAISS.load_local(full_index_path,
+                                        embeddings=embedder,
+                                        allow_dangerous_deserialization=True)
+
+        return saved_vector
+    else:
+        print(f"Embedding documents at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        embedded_vector = FAISS.from_documents(documents=documents, embedding=embedder)
+        embedded_vector.save_local(full_index_path)
+        print(f"Vector store saved at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+        return embedded_vector
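
embed_docs is load-or-build: the first call embeds and saves the index, later calls deserialize it from disk (hence allow_dangerous_deserialization=True, which is only safe here because the index is one this app wrote itself). A quick smoke test of the caching behavior, assuming the packages from requirements.txt are installed and the store path is writable:

# Run this twice: the first call prints "Embedding documents...",
# the second prints "Loading existing vector store...".
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from embed_docs import embed_docs

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vs = embed_docs([Document(page_content="Section 35 guarantees personal liberty.")], emb)
print(vs.similarity_search("personal liberty", k=1)[0].page_content)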
js.py ADDED
@@ -0,0 +1,45 @@
+js = """
+async function main() {
+  const script1 = document.createElement("script");
+  script1.src = "https://cdn.jsdelivr.net/npm/[email protected]/dist/ort.js";
+  document.head.appendChild(script1);
+  const script2 = document.createElement("script");
+  script2.onload = async () => {
+    console.log("vad loaded");
+    var record = document.querySelector('.record-button');
+    record.textContent = "Just Start Talking!";
+    record.style = "width: fit-content; padding-right: 0.5vw;";
+    const myvad = await vad.MicVAD.new({
+      onSpeechStart: () => {
+        var record = document.querySelector('.record-button');
+        var player = document.querySelector('#streaming-out');
+        if (record != null && (player == null || player.paused)) {
+          console.log(record);
+          record.click();
+        }
+      },
+      onSpeechEnd: (audio) => {
+        var stop = document.querySelector('.stop-button');
+        if (stop != null) {
+          console.log(stop);
+          stop.click();
+        }
+      }
+    });
+    myvad.start();
+  };
+  script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/[email protected]/dist/bundle.min.js";
+  script1.onload = () => {
+    console.log("onnx loaded");
+    document.head.appendChild(script2);
+  };
+}
+"""
+
+js_reset = """
+() => {
+  var record = document.querySelector('.record-button');
+  record.textContent = "Just Start Talking!";
+  record.style = "width: fit-content; padding-right: 0.5vw;";
+}
+"""
load_document.py ADDED
@@ -0,0 +1,12 @@
+
+
+from langchain_community.document_loaders import PDFPlumberLoader
+from langchain_community.document_loaders import PyMuPDFLoader
+
+
+# Load the PDF
+def load_data(document):
+    loader = PDFPlumberLoader(document)
+    docs = loader.load()
+
+    return docs
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-fastapi
+fastapi[standard]==0.115.12
 gradio==5.31.0
 langchain==0.3.25
 langchain_community==0.3.24
@@ -8,16 +8,17 @@ langchain_huggingface==0.2.0
 langchain_ollama==0.3.3
 numpy==2.2.6
 Requests==2.32.3
-#selenium==4.32.0
-#transformers==4.46.2
+selenium==4.32.0
+transformers==4.46.2
 uvicorn==0.34.2
 torch
 torchvision
-#huggingface_hub[hf_xet]
+huggingface_hub[hf_xet]
 pdfplumber
 faiss-cpu
 numpy
 pydantic
 protobuf
 sentencepiece>=0.1.99
-accelerate>=0.26.0
+accelerate>=0.26.0
+pymupdf
retrieve.py ADDED
@@ -0,0 +1,7 @@
+def retrieve(saved_vector):
+    print("Retrieving similar documents...")
+    retriever = saved_vector.as_retriever(
+        search_type="similarity",
+        search_kwargs={"k": 3}
+    )
+    return retriever
split_document.py ADDED
@@ -0,0 +1,13 @@
+from langchain_experimental.text_splitter import SemanticChunker
+
+
+
+
+def split_docs(docs, embedder):
+
+    # Split into chunks using the SemanticChunker with the embedder
+    print("Splitting documents into chunks...")
+    text_splitter = SemanticChunker(embeddings=embedder)
+    documents = text_splitter.split_documents(docs)
+
+    return documents
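
Unlike fixed-size splitters, SemanticChunker places breakpoints where adjacent sentences' embeddings diverge, so chunk sizes vary with the text. A minimal standalone check, assuming the same embedder app.py builds (the sample text is illustrative):

# Prints each chunk's length and opening; boundaries fall at semantic
# shifts rather than at a fixed character count.
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from split_document import split_docs

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
doc = Document(page_content="Chapter one defines citizenship. Chapter four lists fundamental rights.")
for chunk in split_docs([doc], emb):
    print(len(chunk.page_content), chunk.page_content[:60])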
theme.py ADDED
@@ -0,0 +1,19 @@
+import gradio as gr
+
+theme = gr.themes.Soft(
+    primary_hue=gr.themes.Color(
+        c100="#82000019",
+        c200="#82000033",
+        c300="#8200004c",
+        c400="#82000066",
+        c50="#8200007f",
+        c500="#8200007f",
+        c600="#82000099",
+        c700="#820000b2",
+        c800="#820000cc",
+        c900="#820000e5",
+        c950="#820000f2",
+    ),
+    secondary_hue="rose",
+    neutral_hue="stone",
+)
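
The palette is eleven shade variants of the same deep maroon (#820000 at varying alpha); app_ui.py applies it via gr.Blocks(theme=theme). To preview it in isolation, a minimal sketch:

# Standalone preview of the theme (mirrors how app_ui.py applies it).
import gradio as gr
from theme import theme

with gr.Blocks(theme=theme) as demo:
    gr.Button("Primary action", variant="primary")

if __name__ == "__main__":
    demo.launch()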