MagicMeWizard committed on
Commit
f9f65ef
·
verified ·
1 Parent(s): d0e21d2

Create config.py

Browse files
Files changed (1) hide show
  1. config.py +204 -0
config.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration settings for AI Web Scraper
3
+ Centralized configuration management for security, performance, and features
4
+ """
5
+
6
+ import os
7
+ from typing import Dict, List, Optional
8
+ from dataclasses import dataclass
9
+
10
@dataclass
class SecurityConfig:
    """Security-related configuration.

    The two list-valued fields default to ``None`` and are replaced with a
    fresh default list in ``__post_init__``, so instances never share a list.
    Passing ``None`` explicitly also selects the defaults.
    """

    # URL validation settings
    # Schemes a URL may use; None selects ['http', 'https'].
    allowed_schemes: Optional[List[str]] = None
    # Host names / dotted IPv4 prefixes to reject; None selects the built-in
    # list of loopback, private-range and link-local entries below.
    blocked_domains: Optional[List[str]] = None
    max_url_length: int = 2048

    # Rate limiting
    requests_per_minute: int = 30
    requests_per_hour: int = 500

    # Content safety
    max_content_size: int = 10 * 1024 * 1024  # 10MB
    max_processing_time: int = 60  # seconds

    def __post_init__(self):
        """Fill in the list defaults that were left as ``None``."""
        if self.allowed_schemes is None:
            self.allowed_schemes = ['http', 'https']

        if self.blocked_domains is None:
            self.blocked_domains = [
                'localhost', '127.0.0.1', '0.0.0.0',
                '192.168.', '10.',
                # 172.16.0.0/12 private range, one prefix per second octet.
                *(f'172.{octet}.' for octet in range(16, 32)),
                # Link-local range (includes the 169.254.169.254 cloud
                # metadata address) and the IPv6 loopback address.
                '169.254.', '::1',
            ]
39
+
40
@dataclass
class ModelConfig:
    """Settings for the summarization models.

    Every field has a default, so ``ModelConfig()`` yields a usable
    configuration. Field order is part of the positional constructor
    signature and must not change.
    """

    # Model identifier used for summarization by default.
    primary_model: str = "facebook/bart-large-cnn"

    # Lighter alternative intended for faster processing.
    fallback_model: str = "sshleifer/distilbart-cnn-12-6"

    # Length limits (units are not stated here -- presumably tokens; confirm
    # against the code that consumes them).
    max_input_length: int = 1024
    max_summary_length: int = 500
    min_summary_length: int = 30

    # Runtime / performance knobs.
    device: str = "auto"  # one of: auto, cpu, cuda
    batch_size: int = 1
    use_fast_tokenizer: bool = True
58
+
59
@dataclass
class ScrapingConfig:
    """Settings that govern how pages are fetched and extracted.

    Every field has a default. Field order is part of the positional
    constructor signature and must not change.
    """

    # HTTP request behaviour (timeout / delay presumably in seconds -- confirm
    # against the HTTP client that reads them).
    timeout: int = 15
    max_retries: int = 3
    retry_delay: int = 1

    # Value sent as the User-Agent header.
    user_agent: str = "Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)"

    # Bounds on the size of extracted content.
    min_content_length: int = 100
    max_content_length: int = 100_000

    # robots.txt handling.
    respect_robots_txt: bool = True
    robots_cache_duration: int = 60 * 60  # seconds
77
+
78
@dataclass
class UIConfig:
    """User interface configuration.

    ``supported_export_formats`` defaults to ``None`` and is replaced with a
    fresh ``["CSV", "JSON"]`` list in ``__post_init__``, so instances never
    share a list. Passing ``None`` explicitly also selects that default.
    """

    # Default values
    default_summary_length: int = 300
    max_summary_length: int = 500
    min_summary_length: int = 100

    # Interface settings
    enable_batch_processing: bool = True
    max_batch_size: int = 10
    show_advanced_options: bool = False

    # Export settings
    # Annotated Optional because None is the declared (sentinel) default.
    supported_export_formats: Optional[List[str]] = None

    def __post_init__(self):
        """Fill in the export-format default when it was left as ``None``."""
        if self.supported_export_formats is None:
            self.supported_export_formats = ["CSV", "JSON"]
97
+
98
class Config:
    """Main configuration class.

    Holds one instance each of the security, model, scraping and UI settings
    objects and, at construction time, overrides selected fields from
    environment variables.
    """

    # Values of RESPECT_ROBOTS_TXT (compared case-insensitively, surrounding
    # whitespace ignored) that turn the setting on; anything else turns it off.
    _TRUE_VALUES = ('true', '1', 'yes', 'on')

    def __init__(self):
        self.security = SecurityConfig()
        self.models = ModelConfig()
        self.scraping = ScrapingConfig()
        self.ui = UIConfig()

        # Load from environment variables if available
        self._load_from_env()

    @staticmethod
    def _env_int(name: str) -> Optional[int]:
        """Return environment variable *name* as an int, or None if unset/empty.

        Raises:
            ValueError: if the variable is set but is not a valid integer. The
                message names the variable so a bad deployment setting is easy
                to locate.
        """
        raw = os.getenv(name)
        if not raw:
            return None
        try:
            return int(raw)
        except ValueError:
            raise ValueError(
                f"Environment variable {name} must be an integer, got {raw!r}"
            ) from None

    @staticmethod
    def _parse_ip_literal(hostname: str):
        """Return *hostname* as an ``ipaddress`` object, or None if it is a name.

        Besides the canonical dotted-quad and IPv6 forms, this accepts the
        legacy IPv4 spellings understood by ``socket.inet_aton`` (for example
        ``2130706433`` or ``127.1``). IPv4-mapped IPv6 addresses are returned
        as the embedded IPv4 address so that IPv4 block rules apply to them.
        """
        import ipaddress
        import socket

        try:
            address = ipaddress.ip_address(hostname)
        except ValueError:
            try:
                address = ipaddress.ip_address(socket.inet_aton(hostname))
            except (OSError, ValueError):
                return None

        mapped = getattr(address, 'ipv4_mapped', None)
        return address if mapped is None else mapped

    def _load_from_env(self):
        """Load configuration from environment variables.

        Unset or empty variables leave the current value untouched.

        Raises:
            ValueError: if an integer-valued variable holds a non-integer.
        """
        # Security settings
        requests_per_minute = self._env_int('MAX_REQUESTS_PER_MINUTE')
        if requests_per_minute is not None:
            self.security.requests_per_minute = requests_per_minute

        max_content_size = self._env_int('MAX_CONTENT_SIZE')
        if max_content_size is not None:
            self.security.max_content_size = max_content_size

        # Model settings
        primary_model = os.getenv('PRIMARY_MODEL')
        if primary_model:
            self.models.primary_model = primary_model

        fallback_model = os.getenv('FALLBACK_MODEL')
        if fallback_model:
            self.models.fallback_model = fallback_model

        device = os.getenv('DEVICE')
        if device:
            self.models.device = device

        # Scraping settings
        timeout = self._env_int('REQUEST_TIMEOUT')
        if timeout is not None:
            self.scraping.timeout = timeout

        user_agent = os.getenv('USER_AGENT')
        if user_agent:
            self.scraping.user_agent = user_agent

        respect_robots = os.getenv('RESPECT_ROBOTS_TXT')
        if respect_robots:
            # Accept the common truthy spellings, not only the literal 'true',
            # so that e.g. RESPECT_ROBOTS_TXT=1 does not silently disable it.
            self.scraping.respect_robots_txt = (
                respect_robots.strip().lower() in self._TRUE_VALUES
            )

    def get_model_device(self) -> str:
        """Get the appropriate device for model inference.

        Returns the configured device unchanged unless it is ``"auto"``; in
        that case returns ``"cuda"`` when torch is importable and reports CUDA
        as available, otherwise ``"cpu"``.
        """
        if self.models.device == "auto":
            try:
                import torch
                return "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                return "cpu"
        return self.models.device

    def is_url_allowed(self, url: str) -> bool:
        """Check if URL is allowed based on security settings.

        A URL is rejected when it is not a string, exceeds
        ``security.max_url_length``, uses a scheme outside
        ``security.allowed_schemes``, has no host, or its host matches an
        entry of ``security.blocked_domains``. Entries are interpreted as:

        * entries ending in ``.`` (e.g. ``'192.168.'``) are dotted prefixes and
          apply only to hosts that are IP literals;
        * any other entry matches an IP literal whose canonical text equals it,
          or a host name equal to it or a subdomain of it.

        This is a static check on the URL text only: it does not resolve DNS,
        so a public name that resolves to a blocked address is not detected.

        Returns:
            True if the URL passes every check, False otherwise. Never raises;
            any unexpected error is treated as "not allowed".
        """
        from urllib.parse import urlparse

        try:
            if not isinstance(url, str) or len(url) > self.security.max_url_length:
                return False

            parsed = urlparse(url)

            # Check scheme
            if parsed.scheme not in self.security.allowed_schemes:
                return False

            # A trailing dot denotes the same host; drop it before matching.
            hostname = (parsed.hostname or '').rstrip('.')
            if not hostname:
                return False

            # Check blocked domains
            address = self._parse_ip_literal(hostname)
            address_text = None if address is None else str(address)
            for blocked in self.security.blocked_domains:
                entry = blocked.lower()
                if address_text is not None:
                    if address_text == entry:
                        return False
                    if entry.endswith('.') and address_text.startswith(entry):
                        return False
                elif not entry.endswith('.'):
                    domain = entry.lstrip('.')
                    if domain and (hostname == domain
                                   or hostname.endswith('.' + domain)):
                        return False

            return True

        except Exception:
            # Fail closed: a URL that cannot be analysed is not allowed.
            return False

    def get_request_headers(self) -> Dict[str, str]:
        """Get standard request headers, using the configured user agent."""
        return {
            'User-Agent': self.scraping.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
185
+
186
# Global configuration instance
config = Config()

# Environment-specific overrides for Hugging Face Spaces
if os.getenv('SPACE_ID'):
    # Running on Hugging Face Spaces
    config.models.device = "auto"
    config.security.requests_per_minute = 20  # More conservative on shared infrastructure
    config.scraping.timeout = 10  # Shorter timeout on shared infrastructure

    # Enable GPU if available.
    # CUDA_VISIBLE_DEVICES set to an empty string or to "-1" hides every GPU,
    # so only a value that can actually list devices selects "cuda"; otherwise
    # the device stays "auto".
    # NOTE(review): indentation was lost in the source this was recovered from;
    # this check is assumed to belong inside the SPACE_ID branch -- confirm.
    if os.getenv('CUDA_VISIBLE_DEVICES', '').strip() not in ('', '-1'):
        config.models.device = "cuda"

# Development mode overrides
if os.getenv('ENVIRONMENT') == 'development':
    config.security.requests_per_minute = 100
    config.scraping.timeout = 30
    config.ui.show_advanced_options = True