Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -29,15 +29,11 @@ from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 load_dotenv()
 
-# ---------------------------
 # Set up device
-# ---------------------------
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 tts_device = "cuda" if torch.cuda.is_available() else "cpu" # for SNAC and Orpheus TTS
 
-# ---------------------------
 # Load DeepHermes Llama (chat/LLM) model
-# ---------------------------
 hermes_model_id = "prithivMLmods/DeepHermes-3-Llama-3-3B-Preview-abliterated"
 hermes_llm_tokenizer = AutoTokenizer.from_pretrained(hermes_model_id)
 hermes_llm_model = AutoModelForCausalLM.from_pretrained(
@@ -47,9 +43,7 @@ hermes_llm_model = AutoModelForCausalLM.from_pretrained(
 )
 hermes_llm_model.eval()
 
-# ---------------------------
 # Load Qwen2-VL processor and model for multimodal tasks
-# ---------------------------
 MODEL_ID_QWEN = "prithivMLmods/Qwen2-VL-OCR2-2B-Instruct"
 # (If needed, you can pass extra arguments such as a size dict here if required.)
 processor = AutoProcessor.from_pretrained(MODEL_ID_QWEN, trust_remote_code=True)
@@ -59,9 +53,7 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
 
-# ---------------------------
 # Load Orpheus TTS model and SNAC for TTS synthesis
-# ---------------------------
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(tts_device)
@@ -93,17 +85,13 @@ orpheus_tts_model.to(tts_device)
 orpheus_tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_name)
 print(f"Orpheus TTS model loaded to {tts_device}")
 
-# ---------------------------
 # Some global parameters for chat and image generation
-# ---------------------------
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
-# ---------------------------
 # Stable Diffusion XL setup
-# ---------------------------
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH") # SDXL Model repository path via env variable
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH") #SG161222/RealVisXL_V5.0_Lightning
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
 USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
 ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
@@ -126,9 +114,7 @@ if ENABLE_CPU_OFFLOAD:
 
 MAX_SEED = np.iinfo(np.int32).max
 
-# ---------------------------
 # Utility functions
-# ---------------------------
 def save_image(img: Image.Image) -> str:
     unique_name = str(uuid.uuid4()) + ".png"
     img.save(unique_name)
@@ -223,9 +209,7 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 
-# ---------------------------
 # New TTS functions (SNAC/Orpheus pipeline)
-# ---------------------------
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids
@@ -307,9 +291,7 @@ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new
         print(f"Error generating speech: {e}")
         return None
 
-# ---------------------------
 # Main generate function for the chat interface
-# ---------------------------
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -501,9 +483,7 @@ def generate(
     final_response = "".join(outputs)
     yield final_response
 
-# ---------------------------
 # Gradio Interface
-# ---------------------------
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
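
For context, the Stable Diffusion XL section above reads its settings from environment variables, and app.py calls load_dotenv() at startup. The lines below are a minimal, hypothetical sketch of supplying those variables before launching the app locally; the MODEL_VAL_PATH value follows the repository named in the updated comment, and the remaining values simply mirror the defaults already present in app.py.

# Hypothetical configuration sketch -- not part of this commit. app.py reads these
# values via os.getenv(); on Hugging Face Spaces they would normally be set as
# Space variables/secrets, or placed in a .env file picked up by load_dotenv().
import os

os.environ.setdefault("MODEL_VAL_PATH", "SG161222/RealVisXL_V5.0_Lightning")  # SDXL repo named in the updated comment
os.environ.setdefault("MAX_IMAGE_SIZE", "4096")            # same as the in-code default
os.environ.setdefault("USE_TORCH_COMPILE", "0")            # torch.compile disabled by default
os.environ.setdefault("ENABLE_CPU_OFFLOAD", "0")           # CPU offload disabled by default
os.environ.setdefault("MAX_INPUT_TOKEN_LENGTH", "4096")    # chat input truncation limit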
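
The Gradio section at the end of the diff wires the streaming generate generator into gr.ChatInterface. As a reminder of the contract involved (an illustrative sketch only, not code from this Space; echo_stream is a hypothetical stand-in for generate), a ChatInterface fn receives the user message and the chat history and can stream by yielding progressively longer strings, with Gradio showing the last yielded value as the assistant reply.

# Illustrative sketch of the gr.ChatInterface streaming contract (simplified and
# text-only; the real app passes a dict input and extra controls via additional_inputs).
import gradio as gr

def echo_stream(message, history):
    partial = ""
    for ch in str(message):
        partial += ch
        yield partial  # each yield replaces the displayed assistant reply

demo = gr.ChatInterface(fn=echo_stream)

if __name__ == "__main__":
    demo.launch()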