Spaces:
Running
Running
Commit
·
7976e52
1
Parent(s):
727d245
Add application file
Browse files- .gitignore +1 -0
- Dockerfile +14 -0
- ingest.py +25 -0
- requirements.txt +5 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Data
|
Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
2 |
+
# you will also find guides on how best to write your Dockerfile
|
3 |
+
|
4 |
+
FROM python:3.9
|
5 |
+
|
6 |
+
WORKDIR /code
|
7 |
+
|
8 |
+
COPY ./requirements.txt /code/requirements.txt
|
9 |
+
|
10 |
+
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
11 |
+
|
12 |
+
COPY . .
|
13 |
+
|
14 |
+
# Exec-form CMD is [executable, arg1, ...]. Run only the ingest script here:
# Qdrant must be provided as a separate service, because `docker run` cannot
# be invoked from inside this python:3.9 image.
CMD ["python", "ingest.py"]
|
ingest.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
2 |
+
from langchain.document_loaders import DirectoryLoader
|
3 |
+
from langchain.document_loaders import PyPDFLoader
|
4 |
+
from langchain.vectorstores import Qdrant
|
5 |
+
from langchain.embeddings import SentenceTransformerEmbeddings
|
6 |
+
|
7 |
+
# embeddings = SentenceTransformerEmbeddings(model_name='NeuML/pubmedbert-base-embeddings')
|
8 |
+
# embeddings = SentenceTransformerEmbeddings(model_name='mixedbread-ai/mxbai-embed-large-v1')
|
9 |
+
# Embedding model handed to the vector store when the chunks are indexed.
embeddings = SentenceTransformerEmbeddings(
    model_name='BAAI/bge-large-en',
)
|
10 |
+
|
11 |
+
print(embeddings)
|
12 |
+
|
13 |
+
# Load the single PDF matching the glob under Data/, parsed with PyPDFLoader.
loader = DirectoryLoader('Data/', glob='110106081.pdf', show_progress=True, loader_cls=PyPDFLoader)
|
14 |
+
|
15 |
+
documents = loader.load()
|
16 |
+
|
17 |
+
# Splitter configured for chunks of size 1000 with an overlap of 100 between
# consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
|
18 |
+
|
19 |
+
texts = text_splitter.split_documents(documents)
|
20 |
+
|
21 |
+
# Address of the Qdrant server that receives the vectors.
# NOTE(review): assumes a Qdrant instance is already listening on
# localhost:6333 in the environment where this script runs — confirm.
url = "http://localhost:6333/"
|
22 |
+
|
23 |
+
# Build the "patent_database" collection on the Qdrant server at `url` from
# the split documents, embedding them with `embeddings` (gRPC not preferred).
qdrant = Qdrant.from_documents(
    texts,
    embeddings,
    url=url,
    prefer_grpc=False,
    collection_name="patent_database",
)
|
24 |
+
|
25 |
+
print("Vector Database created")
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
# PyPDFLoader is a langchain class, not a PyPI distribution; it needs pypdf.
pypdf
|
3 |
+
# langchain's Qdrant vector store talks to the server through qdrant-client.
qdrant-client
|
4 |
+
# SentenceTransformerEmbeddings is a langchain class backed by the
# sentence-transformers distribution.
sentence-transformers
|
5 |
+
langchain_community
|