Spaces:

Omarrran
/

kashmiri_text_generation_trail

Sleeping

App Files Files Community

Omarrran committed on Oct 26, 2024

Commit

ccca270

verified ·

1 Parent(s): 7686669

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -50

app.py CHANGED Viewed

@@ -1,13 +1,17 @@
 import gradio as gr
 import torch
 import json
-from transformers import GPT2Config
 from torch import nn
 import requests
 from pathlib import Path
 class TextGenerator(nn.Module):
-    def __init__(self, vocab_size, embedding_dim, hidden_dim):
         super().__init__()
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
@@ -19,19 +23,29 @@ class TextGenerator(nn.Module):
         return self.fc(lstm_out)
 def download_file(url, local_path):
-    response = requests.get(url)
-    if response.status_code == 200:
         Path(local_path).parent.mkdir(parents=True, exist_ok=True)
         with open(local_path, 'wb') as f:
             f.write(response.content)
-    else:
-        raise Exception(f"Failed to download {url}")
 def load_model_and_tokenizers():
     # Create a local directory for downloaded files
     cache_dir = Path("model_cache")
     cache_dir.mkdir(exist_ok=True)
     # URLs for the files
     base_url = "https://huggingface.co/Omarrran/temp_data/raw/main"
     files = {
@@ -45,56 +59,83 @@ def load_model_and_tokenizers():
     for filename, url in files.items():
         local_path = cache_dir / filename
         if not local_path.exists():
-            print(f"Downloading {filename}...")
             download_file(url, local_path)
-    # Load configuration
-    with open(cache_dir / "model_config.json", "r") as f:
-        config = json.load(f)
-    # Load tokenizers
-    with open(cache_dir / "word_to_int.json", "r") as f:
-        word_to_int = json.load(f)
-    with open(cache_dir / "int_to_word.json", "r") as f:
-        int_to_word = json.load(f)
-    # Initialize model
-    model = TextGenerator(
-        vocab_size=config['vocab_size'],
-        embedding_dim=config['embedding_dim'],
-        hidden_dim=config['hidden_dim']
-    )
-    # Load model weights
-    model.load_state_dict(torch.load(cache_dir / "model.pt", map_location=torch.device('cpu')))
-    model.eval()
-    return model, word_to_int, int_to_word
 def generate_text(prompt, max_length=100):
-    # Load model and tokenizers (will use cached files after first load)
-    model, word_to_int, int_to_word = load_model_and_tokenizers()
-    # Tokenize input prompt
-    input_ids = [word_to_int.get(word, word_to_int['<UNK>']) for word in prompt.split()]
-    input_tensor = torch.tensor([input_ids])
-    # Generate text
-    generated_ids = input_ids.copy()
-    with torch.no_grad():
-        for _ in range(max_length):
-            current_input = torch.tensor([generated_ids[-50:]])  # Use last 50 tokens as context
-            outputs = model(current_input)
-            next_token_id = outputs[0, -1, :].argmax().item()
-            generated_ids.append(next_token_id)
-            if next_token_id == word_to_int.get('<EOS>', 0):
-                break
-    # Convert ids back to text
-    generated_text = ' '.join([int_to_word.get(str(idx), '<UNK>') for idx in generated_ids])
-    return generated_text
 # Create Gradio interface
 iface = gr.Interface(

 import gradio as gr
 import torch
 import json
 from torch import nn
 import requests
 from pathlib import Path
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 class TextGenerator(nn.Module):
+    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=512):
         super().__init__()
         self.embedding = nn.Embedding(vocab_size, embedding_dim)
         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
         return self.fc(lstm_out)
 def download_file(url, local_path):
+    try:
+        response = requests.get(url)
+        response.raise_for_status()  # Raise an exception for bad status codes
         Path(local_path).parent.mkdir(parents=True, exist_ok=True)
         with open(local_path, 'wb') as f:
             f.write(response.content)
+        logger.info(f"Successfully downloaded {url} to {local_path}")
+    except Exception as e:
+        logger.error(f"Error downloading {url}: {str(e)}")
+        raise
 def load_model_and_tokenizers():
     # Create a local directory for downloaded files
     cache_dir = Path("model_cache")
     cache_dir.mkdir(exist_ok=True)
+    # Default configuration values
+    default_config = {
+        'vocab_size': 10000,  # Default vocabulary size
+        'embedding_dim': 256,  # Default embedding dimension
+        'hidden_dim': 512     # Default hidden dimension
+    }
     # URLs for the files
     base_url = "https://huggingface.co/Omarrran/temp_data/raw/main"
     files = {
     for filename, url in files.items():
         local_path = cache_dir / filename
         if not local_path.exists():
+            logger.info(f"Downloading {filename}...")
             download_file(url, local_path)
+    try:
+        # Load configuration
+        with open(cache_dir / "model_config.json", "r") as f:
+            config = json.load(f)
+            # Merge with default config
+            for key in default_config:
+                if key not in config:
+                    logger.warning(f"Configuration parameter '{key}' not found, using default value: {default_config[key]}")
+                    config[key] = default_config[key]
+    except Exception as e:
+        logger.warning(f"Error loading config file: {str(e)}. Using default configuration.")
+        config = default_config
+    try:
+        # Load tokenizers
+        with open(cache_dir / "word_to_int.json", "r") as f:
+            word_to_int = json.load(f)
+        with open(cache_dir / "int_to_word.json", "r") as f:
+            int_to_word = json.load(f)
+        # Update vocab size based on actual vocabulary
+        config['vocab_size'] = len(word_to_int)
+    except Exception as e:
+        logger.error(f"Error loading tokenizer files: {str(e)}")
+        raise
+    try:
+        # Initialize model
+        model = TextGenerator(
+            vocab_size=config['vocab_size'],
+            embedding_dim=config['embedding_dim'],
+            hidden_dim=config['hidden_dim']
+        )
+        # Load model weights
+        model.load_state_dict(torch.load(cache_dir / "model.pt", map_location=torch.device('cpu')))
+        model.eval()
+        return model, word_to_int, int_to_word
+    except Exception as e:
+        logger.error(f"Error loading model: {str(e)}")
+        raise
 def generate_text(prompt, max_length=100):
+    try:
+        # Load model and tokenizers
+        model, word_to_int, int_to_word = load_model_and_tokenizers()
+        # Tokenize input prompt
+        input_ids = [word_to_int.get(word, word_to_int.get('<UNK>', 0)) for word in prompt.split()]
+        input_tensor = torch.tensor([input_ids])
+        # Generate text
+        generated_ids = input_ids.copy()
+        with torch.no_grad():
+            for _ in range(max_length):
+                current_input = torch.tensor([generated_ids[-50:]])  # Use last 50 tokens as context
+                outputs = model(current_input)
+                next_token_id = outputs[0, -1, :].argmax().item()
+                generated_ids.append(next_token_id)
+                if next_token_id == word_to_int.get('<EOS>', 0):
+                    break
+        # Convert ids back to text
+        generated_text = ' '.join([int_to_word.get(str(idx), '<UNK>') for idx in generated_ids])
+        return generated_text
+    except Exception as e:
+        logger.error(f"Error in text generation: {str(e)}")
+        return f"Error generating text: {str(e)}"
 # Create Gradio interface
 iface = gr.Interface(