DDingcheol committed on
Commit
b035b13
·
1 Parent(s): e7ba74a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -1
app.py CHANGED
@@ -49,7 +49,7 @@ def get_csv_file(docs):
49
  def get_json_file(docs):
50
  pass
51
 
52
-
53
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
54
  def get_text_chunks(documents):
55
  text_splitter = RecursiveCharacterTextSplitter(
@@ -60,7 +60,25 @@ def get_text_chunks(documents):
60
 
61
  documents = text_splitter.split_documents(documents) # 문서들을 청크로 나눕니다.
62
  return documents # 나눈 청크를 반환합니다.
 
 
 
 
 
 
 
63
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  # ν…μŠ€νŠΈ μ²­ν¬λ“€λ‘œλΆ€ν„° 벑터 μŠ€ν† μ–΄λ₯Ό μƒμ„±ν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
66
  def get_vectorstore(text_chunks):
 
49
  def get_json_file(docs):
50
  pass
51
 
52
+ '''
53
  # 문서들을 처리하여 텍스트 청크로 나누는 함수입니다.
54
  def get_text_chunks(documents):
55
  text_splitter = RecursiveCharacterTextSplitter(
 
60
 
61
  documents = text_splitter.split_documents(documents) # 문서들을 청크로 나눕니다.
62
  return documents # 나눈 청크를 반환합니다.
63
+ '''
64
def get_text_chunks(documents):
    """Decode uploaded files as UTF-8 text and split the text into chunks.

    Args:
        documents: Iterable of file-like objects whose ``getvalue()`` returns
            the raw bytes of the file (presumably Streamlit uploaded files --
            confirm against the caller).

    Returns:
        The chunk documents built by the splitter from every file that could
        be decoded. Files that fail to decode are reported and skipped.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,      # Upper bound on the size of each chunk.
        chunk_overlap=200,    # Amount of text shared by neighbouring chunks.
        length_function=len,  # Measure chunk size with len().
    )

    text_list = []  # Decoded text of each document that could be read.
    for doc in documents:
        try:
            # Convert the raw file content into text, assuming UTF-8.
            text = doc.getvalue().decode("utf-8")
        except Exception as e:
            # Best effort: report the unreadable document and keep going.
            print(f"An error occurred while processing a document: {e}")
        else:
            text_list.append(text)

    # The inputs here are plain strings. split_documents() expects Document
    # objects (it reads .page_content / .metadata) and fails on str, so the
    # chunks are built with create_documents(), which accepts raw texts.
    return text_splitter.create_documents(text_list)
82
 
83
  # ν…μŠ€νŠΈ μ²­ν¬λ“€λ‘œλΆ€ν„° 벑터 μŠ€ν† μ–΄λ₯Ό μƒμ„±ν•˜λŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
84
  def get_vectorstore(text_chunks):