rajsecrets0 committed on
Commit
82df4eb
·
1 Parent(s): d116d67

Upload app.py

Files changed (1)
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
# Install the dependencies before running (shell commands, not Python):
#   pip install -qU cassio datasets langchain openai tiktoken
#   pip install PyPDF2

# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

# PDF reading support
from PyPDF2 import PdfReader
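If app.py is run as a standalone script (for example in a Hugging Face Space), the install commands above usually live in a requirements.txt next to app.py rather than in the code. A minimal sketch, listing only the packages imported here, with no version pins (an assumption about your environment):

    cassio
    datasets
    langchain
    openai
    tiktoken
    PyPDF2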
# Connection and API credentials. Placeholders are shown here; fill in your
# own values and avoid committing real secrets to a public repository.
ASTRA_DB_APPLICATION_TOKEN = "AstraCS:..."  # enter your Astra DB application token
ASTRA_DB_ID = ""                            # enter your Database ID
OPENAI_API_KEY = "sk-..."                   # enter your OpenAI key

# Provide the path of the PDF file/files.
pdfreader = PdfReader('Ethics.pdf')
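A safer pattern than hardcoding credentials is to read them from environment variables (for example, Space secrets). A minimal sketch, assuming you export the same three names in your environment:

    import os

    # Read credentials from the environment instead of the source file.
    ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
    ASTRA_DB_ID = os.environ["ASTRA_DB_ID"]
    OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]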
# Read the text from the PDF, page by page.
raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content
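The earlier comment mentions a "pdf file/files"; if you want to index several PDFs, one possible sketch (assuming the PDFs sit in the working directory, and the glob pattern is an assumption) is:

    from glob import glob

    raw_text = ''
    for path in glob('*.pdf'):  # every PDF next to app.py
        for page in PdfReader(path).pages:
            content = page.extract_text()
            if content:
                raw_text += content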
# Initialize the connection to your Astra DB database via CassIO.
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)
# Create the LangChain LLM and embedding objects backed by OpenAI.
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Vector store on Astra DB; session and keyspace are resolved from cassio.init().
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)
from langchain.text_splitter import CharacterTextSplitter

# Split the text with CharacterTextSplitter so that each chunk stays small
# enough not to exceed the model's token limits.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)
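CharacterTextSplitter splits only on the single separator given, so one very long paragraph can still produce a chunk larger than chunk_size. If that happens with your PDF, LangChain's RecursiveCharacterTextSplitter is a common alternative; a sketch with the same sizes assumed:

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Falls back through "\n\n", "\n", " ", "" until chunks fit chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)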
# Embed the chunks and insert them into the Astra DB vector store.
astra_vector_store.add_texts(texts)

print("Inserted %i chunks." % len(texts))

# Wrap the store in an index so it can be queried with an LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)
# Simple interactive question-answering loop over the indexed PDF.
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break

    if query_text == "":
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)