Shreyas094 committed on
Commit
7622b2d
·
verified ·
1 Parent(s): 2e0de7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -8
app.py CHANGED
@@ -129,19 +129,36 @@ Classify as "knowledge_base" if the query:
129
  logger.error(f'Error determining query type: {e}. Defaulting to knowledge_base')
130
  return QueryType.KNOWLEDGE_BASE
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def is_valid_url(url: str) -> bool:
    """Check whether ``url`` parses with both a scheme and a network location.

    Args:
        url: Candidate URL string.

    Returns:
        True when ``urlparse`` yields a non-empty scheme and a non-empty
        netloc; False otherwise, including when parsing fails.
    """
    try:
        result = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed netlocs (for example an
        # unclosed IPv6 bracket); non-string input may fail with TypeError or
        # AttributeError. Catching only these keeps the "never raises"
        # behavior without swallowing KeyboardInterrupt or SystemExit the
        # way a bare ``except:`` does.
        return False
    return bool(result.scheme and result.netloc)
139
 
def extract_urls(text: str) -> List[str]:
    """Extract http(s) URLs from free text.

    The pattern matches the host, an optional ``:port``, and an optional
    path/query/fragment tail, so URLs are returned whole instead of being
    cut off after the host name.

    Args:
        text: Arbitrary text that may contain URLs.

    Returns:
        The URLs found, in order of appearance (duplicates are kept), with
        trailing sentence punctuation removed.
    """
    url_pattern = (
        r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'  # scheme and host
        r'(?::\d+)?'                                 # optional port
        r'(?:[/?#][^)\s]*)?'                         # optional path/query/fragment
    )
    # The host and tail character classes both admit '.', ',', ';' and ':',
    # so punctuation that merely ends the sentence gets captured; strip it.
    return [match.rstrip('.,;:') for match in re.findall(url_pattern, text)]
144
-
145
  async def process_knowledge_base_query(query: str, chat_history: List[List[str]], temperature: float = 0.7) -> str:
146
  """Handle queries that can be answered from the knowledge base, with context."""
147
  logger.info(f'Processing knowledge base query: {query}')
 
129
  logger.error(f'Error determining query type: {e}. Defaulting to knowledge_base')
130
  return QueryType.KNOWLEDGE_BASE
131
 
def extract_urls(text: str) -> List[str]:
    """Extract and validate http(s) URLs from free text.

    The pattern matches the host, an optional ``:port``, and an optional
    tail that may begin with ``/``, ``?`` or ``#``. Without the port group a
    URL such as ``http://localhost:8080/api`` would be truncated to
    ``http://localhost``; without ``?``/``#`` as tail starters a URL such as
    ``https://example.com?q=1`` would lose its query string.

    Args:
        text: Arbitrary text that may contain URLs.

    Returns:
        The URLs found, in order of appearance (duplicates are kept), with
        trailing punctuation removed and only those accepted by
        ``is_valid_url`` retained.
    """
    url_pattern = (
        r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'  # scheme and host
        r'(?::\d+)?'                                 # optional port
        r'(?:[/?#][^)\s]*)?'                         # optional path/query/fragment
    )
    # Remove trailing punctuation or artifacts that the permissive character
    # classes may have captured from the surrounding sentence.
    candidates = (
        match.rstrip('.,;:)') for match in re.findall(url_pattern, text)
    )
    return [url for url in candidates if is_valid_url(url)]
147
+
def is_valid_url(url: str) -> bool:
    """Check whether ``url`` is a complete http(s) URL.

    Args:
        url: Candidate URL string.

    Returns:
        True when the scheme is ``http`` or ``https`` and the URL contains a
        host name; False otherwise, including when parsing fails.
    """
    try:
        result = urlparse(url)
        has_valid_scheme = result.scheme in ('http', 'https')
        # ``netloc`` alone is truthy for host-less values such as ':8080' or
        # 'user@', so check the parsed host name instead.
        has_valid_domain = bool(result.hostname)
    except (ValueError, TypeError, AttributeError) as e:
        # urlparse raises ValueError for malformed netlocs (for example an
        # unclosed IPv6 bracket); non-string input may fail with TypeError or
        # AttributeError.
        logger.error('URL validation error: %s', e)
        return False
    return has_valid_scheme and has_valid_domain
161
 
 
 
 
 
 
162
  async def process_knowledge_base_query(query: str, chat_history: List[List[str]], temperature: float = 0.7) -> str:
163
  """Handle queries that can be answered from the knowledge base, with context."""
164
  logger.info(f'Processing knowledge base query: {query}')