yosuke123456 commited on
Commit
5b175d6
·
verified ·
1 Parent(s): 8fc89a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -21
app.py CHANGED
@@ -1,23 +1,20 @@
1
  # https://qiita.com/nekoniii3/items/5acf764af65212d9f04f
2
 
3
  import gradio as gr
4
- import random
5
- import time
6
 
7
  import os
8
 
9
  from langchain_community.document_loaders import PyMuPDFLoader
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- # from langchain_community.chat_models import ChatOpenAI
12
  from langchain_openai import ChatOpenAI
13
  from langchain_community.vectorstores import Chroma
14
  from langchain.chains import RetrievalQA
15
- from langchain_community.embeddings import OpenAIEmbeddings
16
-
17
 
18
 
19
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
20
- os.environ["OPENAI_API_KEY"] = "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"
21
 
22
  file_name1 = 'ALV2_ALV3DTU操作マニュアルDTU-V3SET01.pdf'
23
  file_name2 = 'ALV3PCサーバ_ソフトウェア操作マニュアル_画像ファイル名付.pdf'
@@ -42,7 +39,8 @@ texts3 = text_splitter.split_documents(documents3)
42
  texts4 = text_splitter.split_documents(documents4)
43
  texts = texts1 + texts2 + texts3 + texts4
44
 
45
- embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
 
46
  vectordb = Chroma.from_documents(texts, embeddings)
47
  llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.05)
48
 
@@ -60,14 +58,45 @@ def save_image_filepath(filepath: str):
60
  shutil.copy(filepath, './filepath{}'.format(file_extension))
61
  pass
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  with gr.Blocks() as demo:
64
  chatbot = gr.Chatbot()
65
- # with gr.Row():
66
- # with gr.Column():
67
- # image_input_filepath = gr.Image(type='filepath')
68
- # image_button_filepath = gr.Button("filepath")
69
-
70
- # image_button_filepath.click(save_image_filepath, inputs=image_input_filepath)
71
 
72
  msg = gr.Textbox()
73
 
@@ -76,23 +105,56 @@ with gr.Blocks() as demo:
76
  reply=reply2['result']
77
 
78
  for sd in reply2["source_documents"]:
79
- page_content = str(sd.page_content)
80
  source = str(sd.metadata["source"])
81
- page = str(sd.metadata["page"]+1).zfill(3)
82
- print("PDF๏ผš" + source)
83
- print("ใƒšใƒผใ‚ธ๏ผš" + page)
 
84
 
85
- reply = reply + '<a href="https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg=108&v=MIW10001&d=LINK_MIW">link</a>'
86
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  return "", history + [[user_message, reply]]
88
 
89
  def bot(history):
90
  yield history
91
- # save_image_filepath("./IMG_yosuke2.jpg")
92
 
93
  msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
94
  bot, chatbot, chatbot
95
  )
96
 
97
  demo.queue()
98
- demo.launch(share=True)
 
1
# https://qiita.com/nekoniii3/items/5acf764af65212d9f04f

import os

import gradio as gr

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings

# Silence fork warnings from the HuggingFace tokenizers library used by the
# local embedding model.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# SECURITY: the OpenAI API key must NOT be hard-coded here. langchain/openai
# read it from the OPENAI_API_KEY environment variable (set it as a Space
# secret). The key previously committed in this line must be revoked.

# Source PDF manuals indexed by the retriever (filenames repaired from
# mojibake in the page rendering — verify against the repo contents).
file_name1 = 'ALV2_ALV3DTU操作マニュアルDTU-V3SET01.pdf'
file_name2 = 'ALV3PCサーバ_ソフトウェア操作マニュアル_画像ファイル名付.pdf'
 
39
texts4 = text_splitter.split_documents(documents4)
# One combined corpus over all four manuals.
texts = texts1 + texts2 + texts3 + texts4

# Local Japanese sentence-embedding model (replaces the former
# OpenAIEmbeddings setup — no per-query embedding API cost).
embeddings = HuggingFaceEmbeddings(model_name="oshizo/sbert-jsnli-luke-japanese-base-lite")
vectordb = Chroma.from_documents(texts, embeddings)
# Low temperature: answers should stay close to the manual text.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.05)
46
 
 
58
  shutil.copy(filepath, './filepath{}'.format(file_extension))
59
  pass
60
 
61
import boto3

# SECURITY: never embed AWS credentials in source. boto3 resolves them
# automatically from the environment (AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY), shared config, or an instance/Space role.
# The key pair previously committed here must be revoked in IAM.
s3 = boto3.client('s3', region_name='ap-northeast-1')
67
+
68
+
69
# Build public URLs for page/image objects stored on S3.
def get_public_url(bucket, target_object_path, s3_client=None):
    """Return the public URL of an object stored on S3.

    Parameters
    ----------
    bucket: string
        S3 bucket name.
    target_object_path: string
        Key (path) of the object inside the bucket.
    s3_client: optional
        boto3 S3 client to use; defaults to the module-level ``s3``
        client (parameter added for testability, backward-compatible).

    Returns
    ----------
    url: string
        Public URL of the object on S3.
    """
    if s3_client is None:
        s3_client = s3
    bucket_location = s3_client.get_bucket_location(Bucket=bucket)
    region = bucket_location['LocationConstraint']
    # Buckets in us-east-1 report LocationConstraint as None; the legacy
    # "s3-<region>" host would then read "s3-None". Fall back to the
    # region-less endpoint in that case.
    if region is None:
        return "https://s3.amazonaws.com/{0}/{1}".format(
            bucket, target_object_path)
    return "https://s3-{0}.amazonaws.com/{1}/{2}".format(
        region, bucket, target_object_path)
91
+
92
import math

import fitz  # PyMuPDF

# Open both manuals once up front so page/annotation lookups during chat
# replies can reuse the same document handles.
doc1 = fitz.open(file_name1)
doc2 = fitz.open(file_name2)
97
+
98
  with gr.Blocks() as demo:
99
  chatbot = gr.Chatbot()
 
 
 
 
 
 
100
 
101
  msg = gr.Textbox()
102
 
 
105
  reply=reply2['result']
106
 
107
  for sd in reply2["source_documents"]:
108
+ # page_content = str(sd.page_content)
109
  source = str(sd.metadata["source"])
110
+ page = sd.metadata["page"]+1
111
+ page_num = str(page).zfill(3)
112
+ # print("PDF๏ผš" + source)
113
+ # print("ใƒšใƒผใ‚ธ๏ผš" + page_num)
114
 
115
+ if source == file_name1:
116
+ # ใƒšใƒผใ‚ธ็”ปๅƒใฎURLใ‚’ๅ–ๅพ—
117
+ bucket='page.dtu.manual'
118
+ key='page'+page_num+'_raster.png'
119
+ url = get_public_url(bucket, key)
120
+ reply = reply + ' <a href='+url+'>'+page_num+'</a>'
121
+
122
+ elif source == file_name2:
123
+ # ใƒšใƒผใ‚ธ็”ปๅƒใฎURLใ‚’ๅ–ๅพ—
124
+ bucket='page.server.manual'
125
+ key='page'+page_num+'_raster.png'
126
+ url = get_public_url(bucket, key)
127
+ reply = reply + ' <a href='+url+'>'+page_num+'</a>'
128
+
129
+ # PDFใซ่ฒผใ‚Šไป˜ใ‘ใ‚ใ‚‹็”ปๅƒใฎURLใ‚’ๅ–ๅพ—
130
+ bucket='image.server.manual'
131
+ page2 = doc2[page]
132
+ page_annotations = page2.annots()
133
+ for annotation in page_annotations:
134
+ annotation_num = str(annotation).zfill(3)
135
+ # ๆณจ้‡ˆใฎใƒ—ใƒญใƒ‘ใƒ†ใ‚ฃใ‚’ๅ–ๅพ—
136
+ key = annotation.info.get('content', '') # ใƒŽใƒผใƒˆๆณจ้‡ˆใฎใƒ†ใ‚ญใ‚นใƒˆใ‚’ๅ–ๅพ—
137
+ url = get_public_url(bucket, key)
138
+ reply = reply + ' <a href='+url+'>'+key+'</a>'
139
+ elif source == file_name3:
140
+ page2 = str(math.floor(1+float(page_num)/2))
141
+ url = "https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg="+page2+"&v=MIW10001&d=LINK_MIW"
142
+ reply = reply + ' <a href="'+url+'">'+page2+'</a>'
143
+ elif source == file_name4:
144
+ page2 = str(math.floor(1+(486+float(page_num))/2))
145
+ url = "https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg="+page2+"&v=MIW10001&d=LINK_MIW"
146
+ reply = reply + ' <a href="'+url+'">'+page2+'</a>'
147
+ else:
148
+ exit(0)
149
+
150
  return "", history + [[user_message, reply]]
151
 
152
def bot(history):
    """Final step of the submit chain: emit the chat history unchanged.

    Written as a generator so Gradio treats it as a (single-step)
    streaming callback.
    """
    yield history
 
154
 
155
  msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
156
  bot, chatbot, chatbot
157
  )
158
 
159
demo.queue()

# SECURITY: the basic-auth password was hard-coded in the repo
# ("root"/"test123456"). Read credentials from the environment instead;
# auth is enabled only when both variables are set. Rotate the old password.
_auth_user = os.environ.get("GRADIO_AUTH_USER")
_auth_pass = os.environ.get("GRADIO_AUTH_PASS")
demo.launch(
    auth=(_auth_user, _auth_pass) if _auth_user and _auth_pass else None,
    share=True,
)