kushagrasharma-13 committed on
Commit
7976e52
·
1 Parent(s): 727d245

Add application file

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. Dockerfile +14 -0
  3. ingest.py +25 -0
  4. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ Data
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ WORKDIR /code
7
+ # Copy only the requirements file first so the install step below runs before the rest of the sources are added.
8
+ COPY ./requirements.txt /code/requirements.txt
9
+
10
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
11
+ # Copy the remaining project files into the working directory (/code).
12
+ COPY . .
13
+ # Exec form needs one array element per argument. Qdrant cannot be started with `docker run` from inside this container; it must already be reachable at the URL ingest.py uses.
14
+ CMD ["python", "ingest.py"]
ingest.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.document_loaders import DirectoryLoader
3
+ from langchain.document_loaders import PyPDFLoader
4
+ from langchain.vectorstores import Qdrant
5
+ from langchain.embeddings import SentenceTransformerEmbeddings
6
+
7
+ # embeddings = SentenceTransformerEmbeddings(model_name='NeuML/pubmedbert-base-embeddings')
8
+ # embeddings = SentenceTransformerEmbeddings(model_name='mixedbread-ai/mxbai-embed-large-v1')
9
+ embeddings = SentenceTransformerEmbeddings(model_name='BAAI/bge-large-en')
10
+
11
+ print(embeddings)
12
+
13
+ loader = DirectoryLoader('Data/', glob='110106081.pdf', show_progress=True, loader_cls=PyPDFLoader)\
14
+
15
+ documents = loader.load()
16
+
17
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
18
+
19
+ texts = text_splitter.split_documents(documents)
20
+
21
+ url = "http://localhost:6333/"
22
+
23
+ qdrant = Qdrant.from_documents(texts, embeddings, url=url, prefer_grpc=False, collection_name="patent_database")
24
+
25
+ print("Vector Database created")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ langchain
2
+ pypdf  # PDF parsing backend used by langchain's PyPDFLoader
3
+ qdrant-client  # client library used by langchain's Qdrant vector store
4
+ sentence-transformers  # model backend used by SentenceTransformerEmbeddings
5
+ langchain_community