halyn committed on
Commit
4ff376f
·
1 Parent(s): d682420

upload code

Browse files
Files changed (2) hide show
  1. app.py +161 -0
  2. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import streamlit as st
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.chains.question_answering import load_qa_chain
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain_community.llms import HuggingFacePipeline
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
10
+
11
+
12
# Module-level handles for the vector store and the QA chain; both start unset.
# NOTE(review): Streamlit re-executes this script on every user interaction,
# so these names are re-initialised to None on each rerun -- confirm that the
# values are persisted between reruns (e.g. in st.session_state).
knowledge_base = None
qa_chain = None
15
+
16
# Load a PDF file and extract its text
def load_pdf(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: Source handed straight to ``PdfReader`` (``main_page``
            passes an ``io.BytesIO`` holding the uploaded bytes).

    Returns:
        The text of all pages in order, separated by newlines.
    """
    pdf_reader = PdfReader(pdf_file)
    # Join pages with a newline so the last word of one page is not glued to
    # the first word of the next; ``or ""`` keeps the join safe when a page
    # yields no text.
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
21
+
22
# Split the text into chunks
def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Split text into overlapping chunks on newline boundaries.

    Args:
        text: The text to split.
        chunk_size: Chunk size passed to ``CharacterTextSplitter``; lengths
            are measured with ``len``. Defaults to the previous fixed value.
        chunk_overlap: Overlap passed to ``CharacterTextSplitter``. Defaults
            to the previous fixed value.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)
31
+
32
# Create the FAISS vector store
def create_knowledge_base(chunks):
    """Embed the text chunks and index them in a FAISS vector store.

    Args:
        chunks: Text chunks to embed.

    Returns:
        The store built by ``FAISS.from_texts``, or ``None`` when ``chunks``
        is empty so that callers can detect the failure with an ``is None``
        check instead of hitting an exception inside the index builder.
    """
    if not chunks:
        return None

    model_name = "sentence-transformers/all-mpnet-base-v2"  # explicit embedding model
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(chunks, embeddings)
37
+
38
# Load the Hugging Face model
def load_model():
    """Load the tokenizer and causal LM and wrap them in a generation pipeline.

    Returns:
        A transformers ``"text-generation"`` pipeline configured with
        ``max_new_tokens=150`` and ``temperature=0.1``.
    """
    model_name = "halyn/gemma2-2b-it-finetuned-paperqa"  # text-generation model
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, clean_up_tokenization_spaces=False
    )
    model = AutoModelForCausalLM.from_pretrained(model_name)
    # NOTE(review): temperature is passed without do_sample=True; it is
    # presumably ignored under greedy decoding -- confirm the intended
    # decoding mode before changing either setting.
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=150,
        temperature=0.1,
    )
44
+
45
# Set up the QA chain
def setup_qa_chain():
    """Build the "stuff" question-answering chain and store it in ``qa_chain``.

    The pipeline from ``load_model()`` is wrapped in ``HuggingFacePipeline``
    and handed to ``load_qa_chain``. If loading the model fails, the error is
    printed, ``qa_chain`` is set to ``None`` and the function returns without
    raising.
    """
    global qa_chain
    try:
        pipe = load_model()
    except Exception as e:
        print(f"Error loading model: {e}")
        # Make the failure observable to callers instead of keeping whatever
        # value the name held before.
        qa_chain = None
        return
    llm = HuggingFacePipeline(pipeline=pipe)
    qa_chain = load_qa_chain(llm, chain_type="stuff")
55
+
56
# Main page UI
def main_page():
    """Render the upload page and prepare the uploaded paper for chatting.

    Shows a PDF uploader, rejects files larger than 10 MB, previews the first
    1000 characters of the extracted text and, once the user confirms, builds
    the knowledge base and the QA chain. On success the paper name is stored
    in ``st.session_state.paper_name`` and ``st.session_state.page`` is set
    to ``"chat"``. Any exception raised while processing is reported with
    ``st.error`` rather than propagated.
    """
    global knowledge_base

    st.title("Welcome to GemmaPaperQA")
    st.subheader("Upload Your Paper")

    paper = st.file_uploader("Upload Here!", type="pdf", label_visibility="hidden")
    if not paper:
        return

    st.write(f"Upload complete! File name: {paper.name}")
    # Check the file size without moving the file pointer
    file_size = paper.size
    if file_size > 10 * 1024 * 1024:  # 10 MB limit
        st.error("File is too large! Please upload a file smaller than 10MB.")
        return

    # Intermediate confirmation step: preview the PDF contents
    with st.spinner('Processing PDF...'):
        try:
            paper.seek(0)  # rewind the read pointer to the start of the file
            contents = paper.read()
            pdf_file = io.BytesIO(contents)
            text = load_pdf(pdf_file)

            # Report an error when no text could be extracted
            if not text.strip():
                st.error("The PDF appears to have no extractable text. Please check the file and try again.")
                return

            st.text_area("Preview of extracted text", text[:1000], height=200)
            st.write(f"Total characters extracted: {len(text)}")
            if st.button("Proceed with this file"):
                chunks = split_text(text)
                knowledge_base = create_knowledge_base(chunks)

                if knowledge_base is None:
                    st.error("Failed to create knowledge base.")
                    return

                setup_qa_chain()

                # setup_qa_chain() swallows model-loading errors, so check
                # its result before announcing success.
                if qa_chain is None:
                    st.error("Failed to initialize the QA chain.")
                    return

                # Drop the file extension; unlike a fixed [:-4] slice this
                # does not eat characters if the name has no 4-char suffix.
                st.session_state.paper_name = paper.name.rsplit(".", 1)[0]
                st.session_state.page = "chat"
                st.success("PDF successfully processed! You can now ask questions.")

        except Exception as e:
            st.error(f"Failed to process the PDF: {str(e)}")
102
+
103
+
104
# Chat page UI
def chat_page():
    """Render the chat page for the paper named in the session state.

    Replays the stored conversation from ``st.session_state.messages``,
    answers a newly submitted prompt via ``get_response_from_model`` and
    appends both the prompt and the answer to the history. A button sets
    ``st.session_state.page`` back to ``"main"``.
    """
    st.title(f"Ask anything about {st.session_state.paper_name}")

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the conversation so far
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("Chat here!"):
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("user"):
            st.markdown(prompt)

        response = get_response_from_model(prompt)

        with st.chat_message("assistant"):
            st.markdown(response)

        st.session_state.messages.append({"role": "assistant", "content": response})

    if st.button("Go back to main page"):
        st.session_state.page = "main"
130
+
131
# Handle the model response
def get_response_from_model(prompt):
    """Answer a question about the uploaded paper.

    Args:
        prompt: The user's question.

    Returns:
        The answer text. When the chain output contains the marker
        ``"Helpful Answer:"`` only the stripped text following its first
        occurrence is returned. A fixed message is returned when the
        knowledge base or the QA chain is missing, and ``"Error: ..."`` when
        retrieval or generation raises; the function itself never raises for
        those cases.
    """
    if knowledge_base is None:
        return "No PDF has been uploaded yet."
    if qa_chain is None:
        return "QA chain is not initialized."

    try:
        docs = knowledge_base.similarity_search(prompt)
        response = qa_chain.run(input_documents=docs, question=prompt)

        if "Helpful Answer:" in response:
            response = response.split("Helpful Answer:")[1].strip()

        return response
    except Exception as e:
        return f"Error: {str(e)}"
149
+
150
# Page setup: default to the upload page with no paper selected
if "page" not in st.session_state:
    st.session_state.page = "main"

if "paper_name" not in st.session_state:
    st.session_state.paper_name = ""

# Page rendering
# Streamlit re-runs this script on every interaction, which resets the
# module-level knowledge_base / qa_chain to None. They are therefore saved in
# st.session_state after the upload page builds them and restored before the
# chat page reads them; otherwise the chat page would never see the paper.
if st.session_state.page == "main":
    main_page()
    if knowledge_base is not None:
        st.session_state.knowledge_base = knowledge_base
    if qa_chain is not None:
        st.session_state.qa_chain = qa_chain
elif st.session_state.page == "chat":
    knowledge_base = st.session_state.get("knowledge_base")
    qa_chain = st.session_state.get("qa_chain")
    chat_page()
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ PyPDF2
3
+ langchain-huggingface
4
+ langchain==0.3.1
5
+ langchain-community==0.3.1
6
+ langchain-core==0.3.8
7
+ langchain-text-splitters==0.3.0
8
+ transformers==4.45.1
9
+ torch==2.4.1
10
+ faiss-cpu==1.8.0.post1
11
+ requests==2.32.3
12
+ huggingface-hub==0.25.1
13
+ sentence-transformers==3.1.1
14
+ peft==0.2.0