Spaces:
Runtime error
Runtime error
LOUIS SANNA
commited on
Commit
·
ec0786f
1
Parent(s):
aa5f7bd
feat(*): make the whole project run locally
Browse files- .gitattributes +3 -0
- .gitignore +1 -0
- README.md +31 -0
- app.py +5 -0
- chroma/chroma-embeddings.parquet +2 -2
- chroma/chroma-embeddings.parquet.tmp +0 -0
- chroma/index/{id_to_uuid_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → id_to_uuid_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl} +2 -2
- chroma/index/{index_63b0b7b3-7308-4629-ba5b-af235fc19082.bin → index_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.bin} +2 -2
- chroma/index/{index_metadata_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → index_metadata_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl} +1 -1
- chroma/index/{uuid_to_id_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → uuid_to_id_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl} +2 -2
- data/raw/cixiidae/Fulgoroidea1906-FaunaBritishIndiaCelonBurma-Distant.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1922-NewIndianHomoptera-Muir.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1934-NewLittleKnownFulgoroidea.Muir1934.pdf [missing#581, 587?].pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1942-HomoptèresChopardAfriqueOccidentale-Lallemand.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1945-LanterfliesTrinidadSiuthAmerica-Fennah.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1945-PintaliaEquitosaAteson-Fennah.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1952-FauneDeFrance-Ribaut .pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1954-CarolineIslands-Metcalf.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1957-DieZikadenAfghanist-Dlabola.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1958-MontNimba-Lallemand.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1965-NewSpeciesWestIndies-Fennah.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1967-Galapagos-Fennah.PDF +3 -0
- data/raw/cixiidae/Fulgoroidea1967-New LittleKnownSouthAfrica-Fennah.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1969-NewCaledonia-Fennah.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1982-ScientificResultsMountCameroonExpedition-VanStalle.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1985-EconomicInsectFaunaChina-ChouLuHuangWang.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea1985-NewSynonymiesCombinationsNewWorldFulgoroidea-Obrien 1985 3491.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea2008-FulgoromorphaSeychellesPreliminaryChecklis-Holzinger-LöckerLöcker.pdf +3 -0
- data/raw/cixiidae/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier.pdf +3 -0
- example.env +1 -0
- load.py +41 -0
- poetry.lock +0 -0
- pyproject.toml +22 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
chroma filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
chroma filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/raw/cixiidae/*.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
+
data/raw/cixiidae/Fulgoroidea1967-Galapagos-Fennah.PDF filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.env
|
README.md
CHANGED
@@ -10,3 +10,34 @@ pinned: false
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
13 |
+
|
14 |
+
|
15 |
+
### Description
|
16 |
+
|
17 |
+
This project uses an LLM to interact with documents describing Fulgoroidea.
|
18 |
+
|
19 |
+
### Quick start
|
20 |
+
|
21 |
+
This project uses Poetry.
|
22 |
+
|
23 |
+
To install packages
|
24 |
+
|
25 |
+
```shell
|
26 |
+
poetry install
|
27 |
+
```
|
28 |
+
|
29 |
+
To launch the custom environment:
|
30 |
+
```shell
|
31 |
+
poetry shell
|
32 |
+
```
|
33 |
+
|
34 |
+
Then, to launch the app:
|
35 |
+
|
36 |
+
```shell
|
37 |
+
python app.py
|
38 |
+
```
|
39 |
+
|
40 |
+
The app can be found at http://127.0.0.1:7860/
|
41 |
+
|
42 |
+
|
43 |
+
|
app.py
CHANGED
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
|
2 |
from langchain.vectorstores import Chroma # for the vectorization part
|
3 |
from langchain.chains import ConversationalRetrievalChain
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
|
3 |
+
# Load environment variables from .env file
|
4 |
+
load_dotenv()
|
5 |
+
|
6 |
from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
|
7 |
from langchain.vectorstores import Chroma # for the vectorization part
|
8 |
from langchain.chains import ConversationalRetrievalChain
|
chroma/chroma-embeddings.parquet
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:86ce62daaa8745aef49791108519965e063de1a1caa0611fefba9915330a8d38
|
3 |
+
size 88840691
|
chroma/chroma-embeddings.parquet.tmp
ADDED
File without changes
|
chroma/index/{id_to_uuid_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → id_to_uuid_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38d1b4a85efa0adb626b6bef79faec65538f47da093d42fa4af15e6ad7bf8662
|
3 |
+
size 116066
|
chroma/index/{index_63b0b7b3-7308-4629-ba5b-af235fc19082.bin → index_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.bin}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9b312b85e2e2ba964451568d56a9235dd5f990451d8272ea5f2f72bda972c2d8
|
3 |
+
size 22514608
|
chroma/index/{index_metadata_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → index_metadata_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 74
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:08928359a065acd02acc0dd47b309a4a5cf7716dfad604edb06a3a52de955544
|
3 |
size 74
|
chroma/index/{uuid_to_id_63b0b7b3-7308-4629-ba5b-af235fc19082.pkl → uuid_to_id_62dbdb42-07cb-451b-94f8-85ab4e0a3aa9.pkl}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:92b9ed8b3ea549ce344b1171b5f50c3f636d33c7f58f6da5df0848c44e0f8cbb
|
3 |
+
size 135749
|
data/raw/cixiidae/Fulgoroidea1906-FaunaBritishIndiaCelonBurma-Distant.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:47374edb165ec953eceab55609918f6e4585c20960c42ad3f68374901c4930a3
|
3 |
+
size 38324614
|
data/raw/cixiidae/Fulgoroidea1922-NewIndianHomoptera-Muir.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba42c7d09d4c5454270c50dc758edb295fccef72dbc8511670321256562c77a3
|
3 |
+
size 3574936
|
data/raw/cixiidae/Fulgoroidea1934-NewLittleKnownFulgoroidea.Muir1934.pdf [missing#581, 587?].pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55dd9ceeefdcb184212b50a74760db0f067ebd24084f3221a846d24a9b67d232
|
3 |
+
size 436216
|
data/raw/cixiidae/Fulgoroidea1942-HomoptèresChopardAfriqueOccidentale-Lallemand.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6fdb1a26498d804939d71b7ab325b57be3f3b560d246dad3a8b2b816c4a9e71f
|
3 |
+
size 188673
|
data/raw/cixiidae/Fulgoroidea1945-LanterfliesTrinidadSiuthAmerica-Fennah.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:000a43fb2e9d1560664f25b4a99038e2b0cb22b81e3be728198e981b13591a47
|
3 |
+
size 62543484
|
data/raw/cixiidae/Fulgoroidea1945-PintaliaEquitosaAteson-Fennah.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50efdb4949fde4efeefc915512af757b2a59ba91b261c5fd6fbd105fc62edc00
|
3 |
+
size 818526
|
data/raw/cixiidae/Fulgoroidea1952-FauneDeFrance-Ribaut .pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:247c3656080830a0b64765b3bd793ca0d8bd7d892970dd36fa2a5ae8cdab0976
|
3 |
+
size 43546668
|
data/raw/cixiidae/Fulgoroidea1954-CarolineIslands-Metcalf.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d59893291595ff0c2a3f33706e234d94c3fab35ddc0243a043e75a887ba0005a
|
3 |
+
size 3327485
|
data/raw/cixiidae/Fulgoroidea1957-DieZikadenAfghanist-Dlabola.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:94ad30ec6e163bf204c0ebe6093453509c6a9a552a12d9e3d7492f31af88607e
|
3 |
+
size 11136356
|
data/raw/cixiidae/Fulgoroidea1958-MontNimba-Lallemand.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c3b8395eb2aff419b9e660e1bc3026c0e9f48e7e3e9699d3d6f928834c242fd9
|
3 |
+
size 621407
|
data/raw/cixiidae/Fulgoroidea1965-NewSpeciesWestIndies-Fennah.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:654a6efbbe8c6b743682a479cdf1b0a0d674d1b5a8e747d4a7e8eef192a912c4
|
3 |
+
size 1557342
|
data/raw/cixiidae/Fulgoroidea1967-Galapagos-Fennah.PDF
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e6d6509361be6c9926f0f94d58e609fe882b8f290c8666df8e6466fcb50cac0
|
3 |
+
size 1562170
|
data/raw/cixiidae/Fulgoroidea1967-New LittleKnownSouthAfrica-Fennah.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6cb0394f0d7215d88fd2bc8eb4c52ec2a5d9cba3cb314a29d9bc8d68167c96c0
|
3 |
+
size 2656903
|
data/raw/cixiidae/Fulgoroidea1969-NewCaledonia-Fennah.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dae6d0db9b5d3d1fa63a2d5d8b7e5c3e5a9723a6da8e6da479930d1b6a53c1d3
|
3 |
+
size 12344035
|
data/raw/cixiidae/Fulgoroidea1982-ScientificResultsMountCameroonExpedition-VanStalle.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c09eecd6eeb8b6e746ce81d358faec03f6f2d7acd64d4d6b828834ec6932afb4
|
3 |
+
size 5030611
|
data/raw/cixiidae/Fulgoroidea1985-EconomicInsectFaunaChina-ChouLuHuangWang.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:062434ec1f6eba2f1822a061aa25c731e25d80b66f8f88eb84d09159df926a35
|
3 |
+
size 50565258
|
data/raw/cixiidae/Fulgoroidea1985-NewSynonymiesCombinationsNewWorldFulgoroidea-Obrien 1985 3491.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:38fa7c85f06fe661e56979493b1df3dd0c142cbfcc4dc9a2f5d797bab1b24389
|
3 |
+
size 763298
|
data/raw/cixiidae/Fulgoroidea2008-FulgoromorphaSeychellesPreliminaryChecklis-Holzinger-LöckerLöcker.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:61dd802becbb724794a73e63b6fb38ba6bec931b199dfd5de67cc0a0fc6c43cc
|
3 |
+
size 155971
|
data/raw/cixiidae/Fulgoroidea2008-Mascareignes-AttiéBourgoinVeslotSoulier.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b45f8118894317bfc092afdf4ecc91cbddcb6e00710725a875d5c293dd4957c5
|
3 |
+
size 465630
|
example.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
OPENAI_API_KEY=
|
load.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
|
3 |
+
# Load environment variables from .env file
|
4 |
+
load_dotenv()
|
5 |
+
|
6 |
+
from langchain.document_loaders import UnstructuredFileLoader # for loading the pdf
|
7 |
+
from langchain.embeddings import OpenAIEmbeddings # for creating embeddings
|
8 |
+
from langchain.vectorstores import Chroma # for the vectorization part
|
9 |
+
from langchain.chains import ConversationalRetrievalChain
|
10 |
+
from langchain.llms import OpenAI # the LLM model we'll use (CHatGPT)
|
11 |
+
from langchain.text_splitter import CharacterTextSplitter
|
12 |
+
from glob import glob
|
13 |
+
import os
|
14 |
+
|
15 |
+
DOCUMENT_PATH = "data/raw/cixiidae"
|
16 |
+
DB_DIR = "chroma"
|
17 |
+
|
18 |
+
pdf_files = glob(os.path.join(DOCUMENT_PATH, "*.pdf"))
|
19 |
+
documents = []
|
20 |
+
|
21 |
+
# Iterate through the list of PDF files
|
22 |
+
for file_path in pdf_files:
|
23 |
+
try:
|
24 |
+
loader = UnstructuredFileLoader(file_path)
|
25 |
+
document = loader.load()
|
26 |
+
documents.extend(document)
|
27 |
+
print(f"File added: {file_path}")
|
28 |
+
|
29 |
+
except Exception as e:
|
30 |
+
print(f"An error occurred while processing the file {file_path}: {str(e)}")
|
31 |
+
|
32 |
+
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
|
33 |
+
documents = text_splitter.split_documents(documents)
|
34 |
+
|
35 |
+
# Now, documents contains all the text chunks produced by splitting every loaded document
|
36 |
+
print(f'Total pages: {len(documents)}')
|
37 |
+
|
38 |
+
embeddings = OpenAIEmbeddings()
|
39 |
+
vectordb = Chroma.from_documents(documents, embedding=embeddings,
|
40 |
+
persist_directory=DB_DIR)
|
41 |
+
vectordb.persist()
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.poetry]
|
2 |
+
name = "datak"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = ""
|
5 |
+
authors = ["LOUIS SANNA <[email protected]>"]
|
6 |
+
readme = "README.md"
|
7 |
+
|
8 |
+
[tool.poetry.dependencies]
|
9 |
+
python = "^3.11"
|
10 |
+
openai = "^0.27.6"
|
11 |
+
langchain = "^0.0.161"
|
12 |
+
chromadb = "^0.3.21"
|
13 |
+
gradio = "^3.28.3"
|
14 |
+
python-dotenv = "^1.0.0"
|
15 |
+
unstructured = "^0.6.3"
|
16 |
+
tiktoken = "^0.4.0"
|
17 |
+
pytesseract = "^0.3.10"
|
18 |
+
|
19 |
+
|
20 |
+
[build-system]
|
21 |
+
requires = ["poetry-core"]
|
22 |
+
build-backend = "poetry.core.masonry.api"
|