Daoneeee committed on
Commit
acd4925
·
1 Parent(s): 887b79e

Update app.py

Browse files
Files changed (2) hide show
  1. app.py +30 -4
  2. facebook_chat.json +0 -217
app.py CHANGED
@@ -29,14 +29,40 @@ def get_pdf_text(pdf_docs):
29
  # 아래 텍스트 추출 함수를 작성
30
 
31
  def get_text_file(docs):
32
- pass
33
-
 
34
 
35
  def get_csv_file(docs):
36
- pass
 
 
 
 
 
 
 
 
 
 
37
 
38
  def get_json_file(docs):
39
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
 
29
  # 아래 텍스트 추출 함수를 작성
30
 
31
def get_text_file(docs):
    """Return the contents of an uploaded plain-text file as a one-item list.

    Args:
        docs: An object exposing ``getvalue()`` that returns the raw file
            bytes (presumably a Streamlit ``UploadedFile`` -- confirm against
            the caller).

    Returns:
        A list holding a single string: the whole file decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes ordinary UTF-8 identically but also drops a leading
    # byte-order mark, which editors on Windows commonly prepend; plain
    # "utf-8" would leave a stray U+FEFF at the start of the text.
    text_content = docs.getvalue().decode("utf-8-sig")
    return [text_content]
35
 
36
def get_csv_file(docs):
    """Extract every cell of an uploaded CSV file as a flat list of strings.

    The file is parsed with pandas, using the first row as the header. Cells
    are emitted column by column (all rows of the first column, then all rows
    of the second, and so on), each converted with ``astype(str)``; a missing
    cell therefore appears as the string ``"nan"``.

    Args:
        docs: An object exposing ``getvalue()`` that returns the raw file
            bytes (presumably a Streamlit ``UploadedFile`` -- confirm against
            the caller).

    Returns:
        A list of cell values as strings, or an empty list when the upload
        contains no data.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
        pandas.errors.ParserError: If pandas cannot parse the CSV content.
    """
    import io

    import pandas as pd

    # "utf-8-sig" also strips a byte-order mark so it cannot leak into the
    # name of the first column.
    csv_content = docs.getvalue().decode("utf-8-sig")
    if not csv_content.strip():
        # pandas raises EmptyDataError on empty input; an empty upload simply
        # contributes no text.
        return []

    # pandas.compat.StringIO was a Python 2/3 shim that no longer exists in
    # current pandas; the standard-library io.StringIO is the replacement.
    csv_data = pd.read_csv(io.StringIO(csv_content))

    text_list = []
    for column in csv_data.columns:
        text_list.extend(csv_data[column].astype(str).tolist())
    return text_list
48
 
49
def get_json_file(docs):
    """Collect every string value found in an uploaded JSON document.

    The parsed document is walked in document order. Each string leaf is
    collected, whether it sits at the top level, inside a list, or inside a
    nested object at any depth. Object keys and non-string leaves (numbers,
    booleans, ``null``) are skipped so the result contains text only.

    Args:
        docs: An object exposing ``getvalue()`` that returns the raw file
            bytes (presumably a Streamlit ``UploadedFile`` -- confirm against
            the caller).

    Returns:
        A list of the string values in the order they appear in the
        document, or an empty list when the upload is empty.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    import json

    # "utf-8-sig" tolerates a leading byte-order mark, which json.loads
    # would otherwise reject.
    json_content = docs.getvalue().decode("utf-8-sig")
    if not json_content.strip():
        return []

    json_data = json.loads(json_content)

    text_list = []
    # An explicit stack (instead of recursion) handles a top-level array as
    # well as an object and is not bounded by the interpreter recursion
    # limit. Children are pushed reversed so they pop in document order.
    pending = [json_data]
    while pending:
        node = pending.pop()
        if isinstance(node, str):
            text_list.append(node)
        elif isinstance(node, dict):
            pending.extend(reversed(list(node.values())))
        elif isinstance(node, list):
            pending.extend(reversed(node))
    return text_list
66
 
67
 
68
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
facebook_chat.json DELETED
@@ -1,217 +0,0 @@
1
- #!pip install jq
2
-
3
- from langchain.document_loaders import JSONLoader
4
-
5
- import json
6
- from pathlib import Path
7
- from pprint import pprint
8
-
9
-
10
- file_path='./example_data/facebook_chat.json'
11
- data = json.loads(Path(file_path).read_text())
12
-
13
- pprint(data)
14
-
15
- {'image': {'creation_timestamp': 1675549016, 'uri': 'image_of_the_chat.jpg'},
16
- 'is_still_participant': True,
17
- 'joinable_mode': {'link': '', 'mode': 1},
18
- 'magic_words': [],
19
- 'messages': [{'content': 'Bye!',
20
- 'sender_name': 'User 2',
21
- 'timestamp_ms': 1675597571851},
22
- {'content': 'Oh no worries! Bye',
23
- 'sender_name': 'User 1',
24
- 'timestamp_ms': 1675597435669},
25
- {'content': 'No Im sorry it was my mistake, the blue one is not '
26
- 'for sale',
27
- 'sender_name': 'User 2',
28
- 'timestamp_ms': 1675596277579},
29
- {'content': 'I thought you were selling the blue one!',
30
- 'sender_name': 'User 1',
31
- 'timestamp_ms': 1675595140251},
32
- {'content': 'Im not interested in this bag. Im interested in the '
33
- 'blue one!',
34
- 'sender_name': 'User 1',
35
- 'timestamp_ms': 1675595109305},
36
- {'content': 'Here is $129',
37
- 'sender_name': 'User 2',
38
- 'timestamp_ms': 1675595068468},
39
- {'photos': [{'creation_timestamp': 1675595059,
40
- 'uri': 'url_of_some_picture.jpg'}],
41
- 'sender_name': 'User 2',
42
- 'timestamp_ms': 1675595060730},
43
- {'content': 'Online is at least $100',
44
- 'sender_name': 'User 2',
45
- 'timestamp_ms': 1675595045152},
46
- {'content': 'How much do you want?',
47
- 'sender_name': 'User 1',
48
- 'timestamp_ms': 1675594799696},
49
- {'content': 'Goodmorning! $50 is too low.',
50
- 'sender_name': 'User 2',
51
- 'timestamp_ms': 1675577876645},
52
- {'content': 'Hi! Im interested in your bag. Im offering $50. Let '
53
- 'me know if you are interested. Thanks!',
54
- 'sender_name': 'User 1',
55
- 'timestamp_ms': 1675549022673}],
56
- 'participants': [{'name': 'User 1'}, {'name': 'User 2'}],
57
- 'thread_path': 'inbox/User 1 and User 2 chat',
58
- 'title': 'User 1 and User 2 chat'}
59
-
60
- loader = JSONLoader(
61
- file_path='./example_data/facebook_chat.json',
62
- jq_schema='.messages[].content',
63
- text_content=False)
64
-
65
- data = loader.load()
66
-
67
- pprint(data)
68
-
69
- [Document(page_content='Bye!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 1}),
70
- Document(page_content='Oh no worries! Bye', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 2}),
71
- Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 3}),
72
- Document(page_content='I thought you were selling the blue one!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 4}),
73
- Document(page_content='Im not interested in this bag. Im interested in the blue one!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 5}),
74
- Document(page_content='Here is $129', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 6}),
75
- Document(page_content='', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 7}),
76
- Document(page_content='Online is at least $100', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 8}),
77
- Document(page_content='How much do you want?', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 9}),
78
- Document(page_content='Goodmorning! $50 is too low.', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 10}),
79
- Document(page_content='Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 11})]
80
-
81
-
82
-
83
-
84
-
85
- file_path = './example_data/facebook_chat_messages.jsonl'
86
- pprint(Path(file_path).read_text())
87
-
88
- ('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n'
89
- '{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no '
90
- 'worries! Bye"}\n'
91
- '{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im '
92
- 'sorry it was my mistake, the blue one is not for sale"}\n')
93
-
94
- loader = JSONLoader(
95
- file_path='./example_data/facebook_chat_messages.jsonl',
96
- jq_schema='.content',
97
- text_content=False,
98
- json_lines=True)
99
-
100
- data = loader.load()
101
-
102
- pprint(data)
103
-
104
- [Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
105
- Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
106
- Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
107
-
108
-
109
-
110
-
111
-
112
- loader = JSONLoader(
113
- file_path='./example_data/facebook_chat_messages.jsonl',
114
- jq_schema='.',
115
- content_key='sender_name',
116
- json_lines=True)
117
-
118
- data = loader.load()
119
-
120
- pprint(data)
121
-
122
- [Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
123
- Document(page_content='User 1', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
124
- Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
125
-
126
-
127
-
128
-
129
-
130
- .messages[].content
131
-
132
- .messages[]
133
-
134
- # Define the metadata extraction function.
135
- def metadata_func(record: dict, metadata: dict) -> dict:
136
-
137
- metadata["sender_name"] = record.get("sender_name")
138
- metadata["timestamp_ms"] = record.get("timestamp_ms")
139
-
140
- return metadata
141
-
142
-
143
- loader = JSONLoader(
144
- file_path='./example_data/facebook_chat.json',
145
- jq_schema='.messages[]',
146
- content_key="content",
147
- metadata_func=metadata_func
148
- )
149
-
150
- data = loader.load()
151
-
152
- pprint(data)
153
-
154
- [Document(page_content='Bye!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 1, 'sender_name': 'User 2', 'timestamp_ms': 1675597571851}),
155
- Document(page_content='Oh no worries! Bye', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 2, 'sender_name': 'User 1', 'timestamp_ms': 1675597435669}),
156
- Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 3, 'sender_name': 'User 2', 'timestamp_ms': 1675596277579}),
157
- Document(page_content='I thought you were selling the blue one!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 4, 'sender_name': 'User 1', 'timestamp_ms': 1675595140251}),
158
- Document(page_content='Im not interested in this bag. Im interested in the blue one!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 5, 'sender_name': 'User 1', 'timestamp_ms': 1675595109305}),
159
- Document(page_content='Here is $129', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 6, 'sender_name': 'User 2', 'timestamp_ms': 1675595068468}),
160
- Document(page_content='', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 7, 'sender_name': 'User 2', 'timestamp_ms': 1675595060730}),
161
- Document(page_content='Online is at least $100', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 8, 'sender_name': 'User 2', 'timestamp_ms': 1675595045152}),
162
- Document(page_content='How much do you want?', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 9, 'sender_name': 'User 1', 'timestamp_ms': 1675594799696}),
163
- Document(page_content='Goodmorning! $50 is too low.', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 10, 'sender_name': 'User 2', 'timestamp_ms': 1675577876645}),
164
- Document(page_content='Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!', metadata={'source': '/Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 11, 'sender_name': 'User 1', 'timestamp_ms': 1675549022673})]
165
-
166
-
167
-
168
-
169
-
170
-
171
-
172
- # Define the metadata extraction function.
173
- def metadata_func(record: dict, metadata: dict) -> dict:
174
-
175
- metadata["sender_name"] = record.get("sender_name")
176
- metadata["timestamp_ms"] = record.get("timestamp_ms")
177
-
178
- if "source" in metadata:
179
- source = metadata["source"].split("/")
180
- source = source[source.index("langchain"):]
181
- metadata["source"] = "/".join(source)
182
-
183
- return metadata
184
-
185
-
186
- loader = JSONLoader(
187
- file_path='./example_data/facebook_chat.json',
188
- jq_schema='.messages[]',
189
- content_key="content",
190
- metadata_func=metadata_func
191
- )
192
-
193
- data = loader.load()
194
-
195
- pprint(data)
196
-
197
- [Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 1, 'sender_name': 'User 2', 'timestamp_ms': 1675597571851}),
198
- Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 2, 'sender_name': 'User 1', 'timestamp_ms': 1675597435669}),
199
- Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 3, 'sender_name': 'User 2', 'timestamp_ms': 1675596277579}),
200
- Document(page_content='I thought you were selling the blue one!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 4, 'sender_name': 'User 1', 'timestamp_ms': 1675595140251}),
201
- Document(page_content='Im not interested in this bag. Im interested in the blue one!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 5, 'sender_name': 'User 1', 'timestamp_ms': 1675595109305}),
202
- Document(page_content='Here is $129', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 6, 'sender_name': 'User 2', 'timestamp_ms': 1675595068468}),
203
- Document(page_content='', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 7, 'sender_name': 'User 2', 'timestamp_ms': 1675595060730}),
204
- Document(page_content='Online is at least $100', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 8, 'sender_name': 'User 2', 'timestamp_ms': 1675595045152}),
205
- Document(page_content='How much do you want?', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 9, 'sender_name': 'User 1', 'timestamp_ms': 1675594799696}),
206
- Document(page_content='Goodmorning! $50 is too low.', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 10, 'sender_name': 'User 2', 'timestamp_ms': 1675577876645}),
207
- Document(page_content='Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json', 'seq_num': 11, 'sender_name': 'User 1', 'timestamp_ms': 1675549022673})]
208
-
209
-
210
- JSON -> [{"text": ...}, {"text": ...}, {"text": ...}]
211
- jq_schema -> ".[].text"
212
-
213
- JSON -> {"key": [{"text": ...}, {"text": ...}, {"text": ...}]}
214
- jq_schema -> ".key[].text"
215
-
216
- JSON -> ["...", "...", "..."]
217
- jq_schema -> ".[]"