Chris4K committed on
Commit
8ce796d
·
1 Parent(s): 1197711

Update vector_store_retriever.py

Browse files
Files changed (1) hide show
  1. vector_store_retriever.py +29 -10
vector_store_retriever.py CHANGED
@@ -3,32 +3,51 @@ import os
3
  import gradio as gr
4
  import time
5
  from pydantic import BaseModel, Field
6
- from typing import Any, Optional, Dict, List
7
  from huggingface_hub import InferenceClient
8
- from langchain.llms.base import LLM
9
- from langchain.embeddings import HuggingFaceInstructEmbeddings
10
  from langchain.vectorstores import Chroma
11
  from dotenv import load_dotenv
12
- from transformers import AutoTokenizer
13
- from transformers import Tool
14
 
15
  load_dotenv()
16
 
17
  path_work = "."
18
  hf_token = os.getenv("HF")
19
 
20
- embeddings = HuggingFaceInstructEmbeddings(
21
- model_name="sentence-transformers/all-MiniLM-L6-v2",
22
- model_kwargs={"device": "cpu"}
23
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  vectordb = Chroma(
26
  persist_directory=path_work + '/new_papers',
27
- embedding_function=embeddings
28
  )
29
 
30
  retriever = vectordb.as_retriever(search_kwargs={"k": 2})#5
31
 
 
32
  class KwArgsModel(BaseModel):
33
  kwargs: Dict[str, Any] = Field(default_factory=dict)
34
 
 
3
  import gradio as gr
4
  import time
5
  from pydantic import BaseModel, Field
6
+ from typing import Any, Optional, Dict, List, Union
7
  from huggingface_hub import InferenceClient
8
+ from langchain.llms.base import LLM, Documents, Images, EmbeddingFunction, Embeddings
 
9
  from langchain.vectorstores import Chroma
10
  from dotenv import load_dotenv
11
+ from transformers import AutoTokenizer, AutoModel, Tool
 
12
 
13
  load_dotenv()
14
 
15
  path_work = "."
16
  hf_token = os.getenv("HF")
17
 
18
class HuggingFaceInstructEmbeddings(EmbeddingFunction):
    """Embed texts by mean-pooling the token states of a Hugging Face model.

    The instance is callable (``__call__``) and also exposes
    ``embed_documents`` / ``embed_query``, the two methods the LangChain
    vector-store wrappers invoke on their ``embedding_function``.

    NOTE(review): ``EmbeddingFunction``, ``Documents``, ``Images`` and
    ``Embeddings`` must resolve at import time. They look like chromadb's
    type names rather than members of ``langchain.llms.base`` -- verify the
    file-level import before relying on this class.

    Args:
        model_name: Hub id or local path passed to ``from_pretrained``.
        model_kwargs: Extra keyword arguments for ``AutoModel.from_pretrained``.
            A ``"device"`` entry is treated as the placement target and is
            not forwarded to ``from_pretrained``.
    """

    def __init__(self, model_name: str, model_kwargs: Optional[Dict[str, Any]] = None):
        load_kwargs = dict(model_kwargs or {})  # copy: never mutate the caller's dict
        # "device" is where the model should live, not a loading option;
        # forwarding it to from_pretrained() ends up in the model constructor.
        device = load_kwargs.pop("device", None)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, **load_kwargs)
        if device is not None:
            self.model.to(device)
        self.model.eval()  # inference only: disable dropout

    def __call__(self, input: Union[Documents, Images]) -> Embeddings:
        """Embed a batch of texts.

        Args:
            input: A string, or an iterable whose items are strings or
                objects exposing a ``text`` attribute.

        Returns:
            One embedding (list of floats) per input item.

        Raises:
            NotImplementedError: If an item is neither a string nor an
                object with a string ``text`` attribute (e.g. an image).
        """
        items = [input] if isinstance(input, str) else list(input)

        texts: List[str] = []
        for item in items:
            if isinstance(item, str):
                texts.append(item)
                continue
            text = getattr(item, "text", None)
            if not isinstance(text, str):
                # Fail loudly instead of falling through to an unbound result.
                raise NotImplementedError(
                    "HuggingFaceInstructEmbeddings only supports text input, "
                    f"got {type(item).__name__}"
                )
            texts.append(text)

        return self._embed_text(texts)

    def embed_documents(self, texts: List[str]) -> Embeddings:
        """Embed several texts; returns one vector per text."""
        return self._embed_text(list(texts))

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string; returns its vector."""
        return self._embed_text([text])[0]

    def _embed_text(self, texts: List[str]) -> Embeddings:
        """Return attention-masked mean-pooled embeddings as nested lists.

        NOTE(review): no L2 normalisation is applied; if the persisted index
        was built from normalised vectors, add it here -- confirm against
        how the store was created.
        """
        import torch  # local import: the file's import block does not provide torch

        if not texts:
            return []

        inputs = self.tokenizer(
            list(texts), return_tensors="pt", padding=True, truncation=True
        )
        inputs = inputs.to(self.model.device)
        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state
        # Weight by the attention mask so padding tokens do not dilute the mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)

        # Plain Python lists match the declared ``Embeddings`` return type.
        return (summed / counts).cpu().tolist()
41
+
42
 
43
  vectordb = Chroma(
44
  persist_directory=path_work + '/new_papers',
45
+ embedding_function=HuggingFaceInstructEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
46
  )
47
 
48
  retriever = vectordb.as_retriever(search_kwargs={"k": 2})#5
49
 
50
+
51
  class KwArgsModel(BaseModel):
52
  kwargs: Dict[str, Any] = Field(default_factory=dict)
53