openfree committed on
Commit
9affa6d
·
verified ·
1 Parent(s): c68a920

Update app.py

Files changed (1)
  1. app.py +96 -37
app.py CHANGED
@@ -30,6 +30,7 @@ import subprocess
 import pytesseract
 from pdf2image import convert_from_path
 import queue  # Added: to handle the queue.Empty exception
+import time   # Added: for streaming timing
 
 # -------------------- Added: imports for PDF-to-Markdown conversion --------------------
 try:
@@ -545,10 +546,15 @@ def clear_cuda_memory():
 @spaces.GPU
 def load_model():
     try:
+        # Clear memory first
+        clear_cuda_memory()
+
         loaded_model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch.bfloat16,
             device_map="auto",
+            # Added setting to reduce memory usage
+            low_cpu_mem_usage=True,
         )
         return loaded_model
     except Exception as e:
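`clear_cuda_memory()` is defined earlier in app.py (see the hunk header above) but its body is not part of this diff. A minimal sketch of what such a helper typically contains, assuming it only wraps Python garbage collection and PyTorch's cache release:

```python
import gc
import torch

def clear_cuda_memory():
    """Hypothetical sketch, not the version in app.py."""
    gc.collect()                      # free unreachable Python objects first
    if torch.cuda.is_available():
        torch.cuda.empty_cache()      # return cached blocks to the CUDA allocator
        torch.cuda.ipc_collect()      # release inter-process CUDA handles
```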
@@ -628,19 +634,22 @@ def stream_chat(
         if len(history) > max_history_length:
             history = history[-max_history_length:]
 
+        # Look up Wikipedia context
+        wiki_context = ""
         try:
             relevant_contexts = find_relevant_context(message)
-            wiki_context = "\n\nRelated Wikipedia information:\n"
-            for ctx in relevant_contexts:
-                wiki_context += (
-                    f"Q: {ctx['question']}\n"
-                    f"A: {ctx['answer']}\n"
-                    f"Similarity: {ctx['similarity']:.3f}\n\n"
-                )
+            if relevant_contexts:  # only add when there are results
+                wiki_context = "\n\nRelated Wikipedia information:\n"
+                for ctx in relevant_contexts:
+                    wiki_context += (
+                        f"Q: {ctx['question']}\n"
+                        f"A: {ctx['answer']}\n"
+                        f"Similarity: {ctx['similarity']:.3f}\n\n"
+                    )
         except Exception as e:
             print(f"Context search error: {str(e)}")
-            wiki_context = ""
 
+        # Build the conversation history
         conversation = []
         for prompt, answer in history:
             conversation.extend([
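`find_relevant_context()` is defined elsewhere in app.py and is not touched by this diff; the new `if relevant_contexts:` guard only assumes it returns a possibly empty list of dicts with `question`, `answer`, and `similarity` keys. A hypothetical sketch of such a lookup over pre-computed embeddings, purely to illustrate that shape:

```python
import numpy as np

def find_relevant_context(message, wiki_qa, embed, top_k=3, min_sim=0.3):
    """Hypothetical sketch: wiki_qa is a list of {"question", "answer", "embedding"}
    entries and embed(text) returns a unit-normalized query vector."""
    query = embed(message)
    scored = []
    for item in wiki_qa:
        sim = float(np.dot(query, item["embedding"]))  # cosine similarity of unit vectors
        if sim >= min_sim:
            scored.append({
                "question": item["question"],
                "answer": item["answer"],
                "similarity": sim,
            })
    scored.sort(key=lambda x: x["similarity"], reverse=True)
    return scored[:top_k]  # empty list when nothing clears the threshold
```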
@@ -648,43 +657,61 @@ def stream_chat(
                 {"role": "assistant", "content": answer}
             ])
 
-        final_message = file_context + wiki_context + "\nCurrent question: " + message
+        # Build the final message
+        final_message = message
+        if file_context:
+            final_message = file_context + "\nCurrent question: " + message
+        if wiki_context:
+            final_message = wiki_context + "\nCurrent question: " + message
+        if file_context and wiki_context:
+            final_message = file_context + wiki_context + "\nCurrent question: " + message
+
         conversation.append({"role": "user", "content": final_message})
 
+        # Build the prompt and tokenize
         input_ids_str = build_prompt(conversation)
-        # First truncate to within 6000 tokens
-        input_ids_str = _truncate_tokens_for_context(input_ids_str, 6000)
-
-        inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
+
+        # Check and limit the context length first
         max_context = 8192
-        input_length = inputs["input_ids"].shape[1]
-        remaining = max_context - input_length
-
-        min_generation = 128
-        # If the remaining token budget is smaller than min_generation, truncate the input further.
-        if remaining < min_generation:
+        tokenized_input = tokenizer(input_ids_str, return_tensors="pt")
+        input_length = tokenized_input["input_ids"].shape[1]
+
+        # Truncate if the context is too long
+        if input_length > max_context - max_new_tokens:
+            print(f"Input is too long: {input_length} tokens. Truncating...")
+            # Reserve a minimum number of generation tokens
+            min_generation = min(256, max_new_tokens)
             new_desired_input_length = max_context - min_generation
-            if new_desired_input_length < 1:
-                new_desired_input_length = 1
-            print(f"[Warning] Input too long; readjusting input_length={input_length} -> {new_desired_input_length}")
-            input_ids_str = _truncate_tokens_for_context(input_ids_str, new_desired_input_length)
-            inputs = tokenizer(input_ids_str, return_tensors="pt").to("cuda")
-            input_length = inputs["input_ids"].shape[1]
-            remaining = max_context - input_length
-
-        # Make sure max_new_tokens cannot go negative
-        if remaining < 1:
-            remaining = 1
+
+            # Truncate the input text at the token level
+            tokens = tokenizer.encode(input_ids_str)
+            if len(tokens) > new_desired_input_length:
+                tokens = tokens[-new_desired_input_length:]
+                input_ids_str = tokenizer.decode(tokens)
+
+            # Re-tokenize
+            tokenized_input = tokenizer(input_ids_str, return_tensors="pt")
+            input_length = tokenized_input["input_ids"].shape[1]
+
+        print(f"Final input length: {input_length} tokens")
+
+        # Move the inputs to CUDA
+        inputs = tokenized_input.to("cuda")
+
+        # Compute the remaining token budget and adjust max_new_tokens
+        remaining = max_context - input_length
         if remaining < max_new_tokens:
-            print(f"[Warning] Input uses many tokens; adjusting max_new_tokens={max_new_tokens} -> {remaining}.")
+            print(f"Adjusting max_new_tokens: {max_new_tokens} -> {remaining}")
             max_new_tokens = remaining
 
         print(f"CUDA memory after creating input tensors: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
+        # Set up the streamer
         streamer = TextIteratorStreamer(
-            tokenizer, timeout=30., skip_prompt=True, skip_special_tokens=True
+            tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
         )
 
+        # Generation parameters
         generate_kwargs = dict(
             **inputs,
             streamer=streamer,
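The change above replaces the old fixed 6000-token cut (via `_truncate_tokens_for_context`) with an inline budget check against `max_context` and `max_new_tokens`. The same logic as a small reusable helper, a sketch only; the name `fit_prompt_to_context` is not part of app.py:

```python
def fit_prompt_to_context(prompt, tokenizer, max_context=8192,
                          max_new_tokens=512, min_generation=256):
    """Hypothetical helper mirroring the inlined truncation above: keep only the
    most recent tokens so the prompt plus a generation budget fits max_context."""
    budget = max_context - min(min_generation, max_new_tokens)
    tokens = tokenizer.encode(prompt)
    if len(tokens) <= budget:
        return prompt
    # Keep the tail, which carries the current question and latest context.
    return tokenizer.decode(tokens[-budget:])
```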
@@ -694,23 +721,51 @@
             max_new_tokens=max_new_tokens,
             do_sample=True,
             temperature=temperature,
-            eos_token_id=255001,
+            eos_token_id=tokenizer.eos_token_id,  # set the EOS token explicitly
         )
 
+        # Clear memory
         clear_cuda_memory()
 
+        # Run generation in a separate thread
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
 
+        # Stream the response
         buffer = ""
+        partial_message = ""
+        last_yield_time = time.time()
+
         try:
             for new_text in streamer:
                 buffer += new_text
+                partial_message += new_text
+
+                # Update the result at a fixed interval or once enough text has accumulated
+                current_time = time.time()
+                if current_time - last_yield_time > 0.1 or len(partial_message) > 20:
+                    yield "", history + [[message, buffer]]
+                    partial_message = ""
+                    last_yield_time = current_time
+
+            # Yield the final response
+            if buffer:
                 yield "", history + [[message, buffer]]
-        except queue.Empty:
-            print("Streamer timed out. Returning the final response.")
+
+            # Save to the conversation log
+            chat_history.add_conversation(message, buffer)
+
+        except Exception as e:
+            print(f"Error during streaming: {str(e)}")
+            if not buffer:  # if the buffer is empty, show an error message
+                buffer = f"An error occurred while generating the response: {str(e)}"
             yield "", history + [[message, buffer]]
-
+
+        # If the thread is still running, wait for it to finish
+        if thread.is_alive():
+            thread.join(timeout=5.0)
+
+        # Clear memory
         clear_cuda_memory()
 
     except Exception as e:
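The new streaming loop flushes partial output at most every 0.1 s or roughly every 20 characters instead of on every token, and the `queue.Empty` timeout handler gives way to a broad exception handler plus a bounded `thread.join`. A minimal, self-contained sketch of that throttling rule (the `fake_token_stream` generator is purely illustrative):

```python
import time

def fake_token_stream():
    # Stand-in for TextIteratorStreamer: emits small text chunks.
    for chunk in ["Hel", "lo", ", ", "wor", "ld", "!"]:
        time.sleep(0.03)
        yield chunk

def throttled_updates(stream, min_interval=0.1, min_chars=20):
    """Yield the accumulated buffer only when enough time or text has passed."""
    buffer, pending, last = "", "", time.time()
    for chunk in stream:
        buffer += chunk
        pending += chunk
        now = time.time()
        if now - last > min_interval or len(pending) > min_chars:
            yield buffer
            pending, last = "", now
    if buffer:  # always flush the final state
        yield buffer

for update in throttled_updates(fake_token_stream()):
    print(update)
```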
@@ -825,6 +880,10 @@ def create_demo():
     )
 
     file_upload.change(
+        fn=lambda: ("Processing...", [["System", "Analyzing the file. Please wait a moment..."]]),
+        outputs=[msg, chatbot],
+        queue=False
+    ).then(
         fn=init_msg,
         outputs=msg,
         queue=False
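The upload handler becomes an event chain: a fast placeholder update followed by the real work via Gradio's `.then()`. A minimal sketch of the same pattern in isolation, assuming a recent Gradio API; the component names and handlers here are illustrative, not the ones in app.py:

```python
import gradio as gr

def show_placeholder():
    # Returned immediately so the UI reacts before the slow step runs.
    return "Processing...", [["System", "Analyzing the file, please wait..."]]

def analyze_file(file):
    # Stand-in for the real analysis step.
    return "Finished analyzing the uploaded file." if file else "No file uploaded."

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    file_upload = gr.File()

    # The first event returns right away; .then() chains the slower handler.
    file_upload.change(
        fn=show_placeholder, outputs=[msg, chatbot], queue=False
    ).then(
        fn=analyze_file, inputs=file_upload, outputs=msg
    )

if __name__ == "__main__":
    demo.launch()
```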
@@ -846,4 +905,4 @@ def create_demo():
 
 if __name__ == "__main__":
     demo = create_demo()
-    demo.launch()
+    demo.launch()
 