rameshmoorthy committed on
Commit
b4ac1bd
·
verified ·
1 Parent(s): b6558e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +390 -113
app.py CHANGED
@@ -5,82 +5,73 @@ from selenium import webdriver
5
  from selenium.webdriver.common.by import By
6
  from selenium.webdriver.support.ui import WebDriverWait
7
  from selenium.webdriver.support import expected_conditions as EC
8
- from selenium.webdriver.chrome.service import Service as ChromeService
9
- from selenium.webdriver.chrome.options import Options as ChromeOptions
 
 
 
10
  from geopy.geocoders import Nominatim, ArcGIS
11
  from geopy.exc import GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError
12
  import time
13
  import pandas as pd
14
  import re
15
  import os
16
- import shutil # For finding chromedriver
17
 
18
  def driversetup_huggingface():
19
- """Custom driver setup for Hugging Face Spaces (headless)."""
20
- options = ChromeOptions()
21
  options.add_argument("--headless")
22
- options.add_argument("--no-sandbox")
23
- # options.add_argument("--disable-gpu")
24
- # options.add_argument("--window-size=1920,1080")
25
- options.add_argument("--disable-dev-shm-usage")
26
- # options.add_argument("lang=en")
27
- # options.add_argument("start-maximized")
28
- # options.add_argument("disable-infobars")
29
- # options.add_argument("--disable-extensions")
30
- # options.add_argument("--disable-blink-features=AutomationControlled")
31
- options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")
32
-
33
- # Attempt to find chromedriver - Hugging Face Spaces might have it in specific locations
34
- # or it might need to be installed via packages.txt or a Dockerfile.
35
- # For Gradio apps on Spaces, it's often pre-configured or easily installable.
36
- # Let's try common paths or rely on it being in PATH.
37
-
38
- # Check if chromedriver is in PATH or use a common location
39
- chromedriver_path = shutil.which("chromedriver")
40
- if chromedriver_path:
41
- print(f"Using chromedriver found at: {chromedriver_path}")
42
- service = ChromeService(executable_path=chromedriver_path)
43
- else:
44
- # Fallback if not in PATH - this might fail on HF if not installed correctly
45
- print("Chromedriver not found in PATH. Attempting to use 'chromedriver' directly (might fail).")
46
- print("For Hugging Face Spaces, ensure Chrome & Chromedriver are available in the environment.")
47
- print("You might need to add 'chromium-chromedriver' to a packages.txt file if using a Docker Space.")
48
- # As a last resort, try initializing without explicit path, hoping Selenium finds it.
49
- # This part is crucial for HF deployment and might need adjustment based on the HF Space environment.
50
- # For many Gradio spaces, simply having 'selenium' and 'chromedriver-binary' (or similar)
51
- # in requirements.txt might work if the base image is well-configured.
52
- # However, for full Chrome, system-level install is better.
53
- # For now, we'll proceed assuming it might be found or will error out gracefully.
54
- try:
55
- # This assumes chromedriver is globally available or Selenium can find it.
56
- # On Hugging Face, if using default Docker runtime, you might need to specify
57
- # apt packages like 'chromium-driver' or 'google-chrome-stable' + 'chromedriver'
58
- # in a packages.txt file or use a custom Dockerfile.
59
- # For simplicity, let's assume it can be found or will fail here.
60
- # A common path if installed via apt in a container:
61
- if os.path.exists("/usr/bin/chromedriver"):
62
- service = ChromeService(executable_path="/usr/bin/chromedriver")
63
- elif os.path.exists("/usr/local/bin/chromedriver"):
64
- service = ChromeService(executable_path="/usr/local/bin/chromedriver")
65
- else:
66
- # This will likely fail if chromedriver isn't installed and in PATH
67
- # On HF Spaces, you typically ensure this via environment setup (e.g. packages.txt)
68
- print("Attempting to initialize ChromeService without explicit path...")
69
- service = ChromeService() # May fail if chromedriver not in PATH
70
- except Exception as e:
71
- print(f"Could not initialize ChromeService: {e}. Ensure chromedriver is installed and in PATH.")
72
- return None
73
-
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  try:
76
- print("Setting up ChromeDriver for Hugging Face environment...")
77
- driver = webdriver.Chrome(service=service, options=options)
78
- print("ChromeDriver setup successful.")
79
- except Exception as e:
80
- print(f"Error setting up ChromeDriver: {e}")
 
 
 
 
 
 
 
 
 
 
81
  return None
82
 
83
- driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
 
 
84
  return driver
85
 
86
  def clean_address(address_str):
@@ -118,71 +109,71 @@ def geocode_address_with_fallbacks(address_str, attempt_count=0):
118
  print(f"{name} geocoding error: {e}")
119
  except Exception as e:
120
  print(f"An unexpected error occurred with {name}: {e}")
121
- time.sleep(1)
122
- if attempt_count == 0:
123
  parts = [s.strip() for s in cleaned_address.split(',') if s.strip()]
124
  if len(parts) > 3:
125
- generic_address = ', '.join(parts[1:])
126
- print(f"Trying a more generic address (v1): '{generic_address}'")
127
- lat, lon = geocode_address_with_fallbacks(generic_address, attempt_count + 1)
128
- if lat is not None: return lat, lon
129
- if len(parts) > 4:
130
- generic_address_v2 = ', '.join(parts[2:])
131
- print(f"Trying a more generic address (v2): '{generic_address_v2}'")
132
- return geocode_address_with_fallbacks(generic_address_v2, attempt_count + 1)
133
  print("All geocoding attempts failed for the address.")
134
  return None, None
135
 
136
  def get_gstin_details_for_gradio(gstin_number_input):
137
- """
138
- Main function for Gradio: takes GSTIN, scrapes, and returns data as DataFrame.
139
- """
140
  gstin_number = str(gstin_number_input).strip().upper()
141
  if not (len(gstin_number) == 15 and gstin_number.isalnum()):
142
  return pd.DataFrame({"Error": ["Invalid GSTIN format. Must be 15 alphanumeric characters."]})
143
 
144
  print(f"Initiating scraper for GSTIN: {gstin_number}")
145
- driver = driversetup_huggingface()
146
 
147
  if driver is None:
148
- print("WebDriver not initialized for scraper.")
149
- return pd.DataFrame({"Error": ["WebDriver initialization failed. Check server logs."]})
150
 
151
  extracted_data = {"GSTIN Queried": gstin_number}
152
- wait_time = 30
153
  url = "https://www.mastersindia.co/gst-number-search-and-gstin-verification/"
154
 
155
  try:
 
156
  driver.get(url)
157
- print(f"Navigated to URL: {url}")
158
 
159
  gstin_input_css_selector = 'input[placeholder="XXXAAAYYYYZ01Z5"]'
160
- WebDriverWait(driver, wait_time).until(
161
- EC.presence_of_element_located((By.CSS_SELECTOR, gstin_input_css_selector))
 
162
  )
163
- gstin_input = driver.find_element(By.CSS_SELECTOR, gstin_input_css_selector)
164
  gstin_input.clear()
165
  gstin_input.send_keys(gstin_number)
166
  print(f"Entered GSTIN: {gstin_number}")
 
167
 
168
  search_button_css_selector = 'button[aria-label="Search"]'
169
- WebDriverWait(driver, wait_time).until(
 
170
  EC.element_to_be_clickable((By.CSS_SELECTOR, search_button_css_selector))
171
  )
172
- search_button = driver.find_element(By.CSS_SELECTOR, search_button_css_selector)
 
 
173
  driver.execute_script("arguments[0].click();", search_button)
174
- print("Clicked Search button.")
175
 
176
- results_table_container_css_selector_for_wait = "div.eaKoeQ table"
 
177
  WebDriverWait(driver, wait_time).until(
178
- EC.presence_of_element_located((By.CSS_SELECTOR, results_table_container_css_selector_for_wait))
179
  )
180
- print("Results table container found.")
181
- time.sleep(4)
182
 
183
  page_source = driver.page_source
184
  soup = BeautifulSoup(page_source, 'html.parser')
185
-
186
  table_container_div = soup.select_one("div.eaKoeQ")
187
  table = None
188
  if table_container_div: table = table_container_div.find('table')
@@ -198,19 +189,23 @@ def get_gstin_details_for_gradio(gstin_number_input):
198
 
199
  rows = table.find_all('tr')
200
  raw_data = {}
201
- for row in rows:
 
 
 
 
202
  header_element = row.find('th', class_=lambda x: x and 'eLVLDP' in x.split())
203
  value_element = row.find('td', class_=lambda x: x and 'jdgLDg' in x.split())
204
  if header_element and value_element:
205
  raw_data[header_element.get_text(strip=True)] = value_element.get_text(strip=True)
206
  elif len(row.find_all('td')) == 2:
207
  cells = row.find_all('td')
208
- if cells[0].get_text(strip=True):
209
- raw_data[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True)
210
 
211
  if not raw_data:
212
- print("Could not parse any data from the table rows.")
213
- return pd.DataFrame({"Error": ["Failed to parse data from table."]})
214
 
215
  fields_to_extract_map = {
216
  "Principal Place of Business": "Principal Business Address",
@@ -237,22 +232,19 @@ def get_gstin_details_for_gradio(gstin_number_input):
237
  print("Principal Place of Business not found or empty, skipping geocoding.")
238
 
239
  print(f"Successfully scraped data for {gstin_number}")
240
- # Convert dictionary to a 2-column DataFrame for Gradio
241
  df_output = pd.DataFrame(list(extracted_data.items()), columns=["Field", "Value"])
242
  return df_output
243
 
244
  except Exception as e:
245
  print(f"An error occurred during scraping process for {gstin_number}: {e}")
246
- # import traceback
247
- # traceback.print_exc()
248
  return pd.DataFrame({"Error": [f"Scraping process failed: {str(e)}"]})
249
  finally:
250
  if 'driver' in locals() and driver is not None:
251
  try:
252
  driver.quit()
253
  print("Browser closed.")
254
- except Exception as e:
255
- print(f"Error quitting driver: {e}")
256
 
257
  # --- Gradio Interface ---
258
  iface = gr.Interface(
@@ -268,20 +260,305 @@ iface = gr.Interface(
268
  headers=["Field", "Value"],
269
  wrap=True
270
  ),
271
- title="🧾 GSTIN Details Scraper & Verifier",
272
- description="Enter a valid 15-character Indian GSTIN to fetch its registration details and attempt to geocode the principal place of business. Uses Masters India for scraping.",
273
- article="<p style='text-align: center;'>Powered by Selenium, BeautifulSoup, Geopy, and Gradio. <br>Note: Scraping may take 20-40 seconds. Geocoding accuracy may vary.</p>",
274
- examples=[["27AAFCD5562R1Z5"], ["07AAFCM6072R1Z8"]], # Example GSTINs
275
  allow_flagging="never",
276
- theme=gr.themes.Soft() # Using a soft theme
277
  )
278
 
279
  if __name__ == '__main__':
280
- # For Hugging Face Spaces, Gradio typically handles the server.
281
- # This launch(share=True) is more for local testing if you want a public link temporarily.
282
- # On HF Spaces, just `iface.launch()` is enough.
283
- # To run locally: python app.py
284
- if os.environ.get("SYSTEM") == "spaces": # Check if running in Hugging Face Spaces
285
  iface.launch(debug=False)
286
  else:
287
- iface.launch(debug=True, share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from selenium.webdriver.common.by import By
6
  from selenium.webdriver.support.ui import WebDriverWait
7
  from selenium.webdriver.support import expected_conditions as EC
8
+ # Import Firefox specific classes
9
+ from selenium.webdriver.firefox.service import Service as FirefoxService
10
+ from selenium.webdriver.firefox.options import Options as FirefoxOptions
11
+ # from selenium.webdriver.chrome.service import Service as ChromeService # No longer needed
12
+ # from selenium.webdriver.chrome.options import Options as ChromeOptions # No longer needed
13
  from geopy.geocoders import Nominatim, ArcGIS
14
  from geopy.exc import GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError
15
  import time
16
  import pandas as pd
17
  import re
18
  import os
19
+ import shutil # For finding geckodriver
20
 
21
  def driversetup_huggingface():
22
+ """Custom driver setup for Hugging Face Spaces using Firefox (headless)."""
23
+ options = FirefoxOptions()
24
  options.add_argument("--headless")
25
+ options.add_argument("--window-size=1920,1080") # Set a reasonable window size
26
+ options.add_argument("--disable-gpu") # Often recommended for headless
27
+ # Firefox doesn't use --no-sandbox or --disable-dev-shm-usage in the same way as Chrome
28
+ # User agent and other settings
29
+ options.set_preference("intl.accept_languages", "en-US, en")
30
+ options.set_preference("general.useragent.override", "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0") # Example Firefox UA
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ geckodriver_path = shutil.which("geckodriver")
33
+ service = None
34
+ if geckodriver_path:
35
+ print(f"Using geckodriver found at: {geckodriver_path}")
36
+ service = FirefoxService(executable_path=geckodriver_path)
37
+ else:
38
+ print("Geckodriver not found in PATH by shutil.which.")
39
+ common_paths = ["/usr/bin/geckodriver", "/usr/local/bin/geckodriver"]
40
+ for path in common_paths:
41
+ if os.path.exists(path):
42
+ print(f"Found geckodriver at common path: {path}")
43
+ service = FirefoxService(executable_path=path)
44
+ break
45
+ if not service:
46
+ print("Geckodriver not found in common paths. Attempting to initialize FirefoxService without explicit path...")
47
+ print("Ensure 'firefox-esr' and 'geckodriver' are in packages.txt for HF Spaces.")
48
+ try:
49
+ # This will likely fail if geckodriver isn't installed and in PATH
50
+ service = FirefoxService()
51
+ except Exception as e_service:
52
+ print(f"Could not initialize FirefoxService without explicit path: {e_service}")
53
+ return None
54
  try:
55
+ print("Setting up GeckoDriver (Firefox) for Hugging Face environment...")
56
+ driver = webdriver.Firefox(service=service, options=options)
57
+ print("GeckoDriver (Firefox) setup successful.")
58
+ except Exception as e_webdriver:
59
+ print(f"Error setting up GeckoDriver (Firefox): {e_webdriver}")
60
+ if service and service.path: # Check if service.path exists
61
+ # geckodriver supports a --version flag, just as chromedriver does;
62
+ # run it to confirm the binary is present and actually executes
63
+ try:
64
+ os.system(f"{service.path} --version > geckodriver_version.txt 2>&1")
65
+ with open("geckodriver_version.txt", "r") as f:
66
+ print(f"Geckodriver version check output: {f.read()}")
67
+ os.remove("geckodriver_version.txt")
68
+ except Exception as e_ver:
69
+ print(f"Could not execute geckodriver version check: {e_ver}")
70
  return None
71
 
72
+ # The AutomationControlled blink feature is Chrome-specific.
73
+ # For Firefox, such measures are less common or handled differently.
74
+ # driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});") # This is Chrome specific
75
  return driver
76
 
77
  def clean_address(address_str):
 
109
  print(f"{name} geocoding error: {e}")
110
  except Exception as e:
111
  print(f"An unexpected error occurred with {name}: {e}")
112
+ time.sleep(1) # Be respectful to APIs
113
+ if attempt_count == 0: # Try a more generic address only once
114
  parts = [s.strip() for s in cleaned_address.split(',') if s.strip()]
115
  if len(parts) > 3:
116
+ start_index = max(0, len(parts) - 4)
117
+ generic_address = ', '.join(parts[start_index:])
118
+ print(f"Trying a more generic address: '{generic_address}'")
119
+ return geocode_address_with_fallbacks(generic_address, attempt_count + 1)
120
+
 
 
 
121
  print("All geocoding attempts failed for the address.")
122
  return None, None
123
 
124
  def get_gstin_details_for_gradio(gstin_number_input):
 
 
 
125
  gstin_number = str(gstin_number_input).strip().upper()
126
  if not (len(gstin_number) == 15 and gstin_number.isalnum()):
127
  return pd.DataFrame({"Error": ["Invalid GSTIN format. Must be 15 alphanumeric characters."]})
128
 
129
  print(f"Initiating scraper for GSTIN: {gstin_number}")
130
+ driver = driversetup_huggingface() # Now uses Firefox setup
131
 
132
  if driver is None:
133
+ print("WebDriver (Firefox) not initialized for scraper.")
134
+ return pd.DataFrame({"Error": ["WebDriver (Firefox) initialization failed. Check server logs for GeckoDriver errors."]})
135
 
136
  extracted_data = {"GSTIN Queried": gstin_number}
137
+ wait_time = 35
138
  url = "https://www.mastersindia.co/gst-number-search-and-gstin-verification/"
139
 
140
  try:
141
+ print(f"Navigating to URL: {url}")
142
  driver.get(url)
143
+ time.sleep(1.5) # Slightly longer pause for Firefox initial page load
144
 
145
  gstin_input_css_selector = 'input[placeholder="XXXAAAYYYYZ01Z5"]'
146
+ print(f"Waiting for GSTIN input box: {gstin_input_css_selector}")
147
+ gstin_input = WebDriverWait(driver, wait_time).until(
148
+ EC.visibility_of_element_located((By.CSS_SELECTOR, gstin_input_css_selector))
149
  )
150
+ print("GSTIN input box visible.")
151
  gstin_input.clear()
152
  gstin_input.send_keys(gstin_number)
153
  print(f"Entered GSTIN: {gstin_number}")
154
+ time.sleep(0.5)
155
 
156
  search_button_css_selector = 'button[aria-label="Search"]'
157
+ print(f"Waiting for Search button: {search_button_css_selector}")
158
+ search_button = WebDriverWait(driver, wait_time).until(
159
  EC.element_to_be_clickable((By.CSS_SELECTOR, search_button_css_selector))
160
  )
161
+ print("Search button clickable.")
162
+ driver.execute_script("arguments[0].scrollIntoView(true);", search_button)
163
+ time.sleep(0.5)
164
  driver.execute_script("arguments[0].click();", search_button)
165
+ print("Clicked Search button using JavaScript.")
166
 
167
+ results_table_css_selector = "div.eaKoeQ table tbody tr"
168
+ print(f"Waiting for results table rows: {results_table_css_selector}")
169
  WebDriverWait(driver, wait_time).until(
170
+ EC.presence_of_all_elements_located((By.CSS_SELECTOR, results_table_css_selector))
171
  )
172
+ print("Results table rows are present.")
173
+ time.sleep(3)
174
 
175
  page_source = driver.page_source
176
  soup = BeautifulSoup(page_source, 'html.parser')
 
177
  table_container_div = soup.select_one("div.eaKoeQ")
178
  table = None
179
  if table_container_div: table = table_container_div.find('table')
 
189
 
190
  rows = table.find_all('tr')
191
  raw_data = {}
192
+ if not rows:
193
+ print("Table found, but no rows (<tr>) parsed from it.")
194
+ return pd.DataFrame({"Error": ["Data table found but no rows could be parsed."]})
195
+
196
+ for row_num, row in enumerate(rows):
197
  header_element = row.find('th', class_=lambda x: x and 'eLVLDP' in x.split())
198
  value_element = row.find('td', class_=lambda x: x and 'jdgLDg' in x.split())
199
  if header_element and value_element:
200
  raw_data[header_element.get_text(strip=True)] = value_element.get_text(strip=True)
201
  elif len(row.find_all('td')) == 2:
202
  cells = row.find_all('td')
203
+ key = cells[0].get_text(strip=True)
204
+ if key: raw_data[key] = cells[1].get_text(strip=True)
205
 
206
  if not raw_data:
207
+ print("Could not parse any key-value data from the table rows.")
208
+ return pd.DataFrame({"Error": ["Failed to parse key-value data from table rows."]})
209
 
210
  fields_to_extract_map = {
211
  "Principal Place of Business": "Principal Business Address",
 
232
  print("Principal Place of Business not found or empty, skipping geocoding.")
233
 
234
  print(f"Successfully scraped data for {gstin_number}")
 
235
  df_output = pd.DataFrame(list(extracted_data.items()), columns=["Field", "Value"])
236
  return df_output
237
 
238
  except Exception as e:
239
  print(f"An error occurred during scraping process for {gstin_number}: {e}")
 
 
240
  return pd.DataFrame({"Error": [f"Scraping process failed: {str(e)}"]})
241
  finally:
242
  if 'driver' in locals() and driver is not None:
243
  try:
244
  driver.quit()
245
  print("Browser closed.")
246
+ except Exception as e_quit:
247
+ print(f"Error quitting driver: {e_quit}")
248
 
249
  # --- Gradio Interface ---
250
  iface = gr.Interface(
 
260
  headers=["Field", "Value"],
261
  wrap=True
262
  ),
263
+ title="🧾 GSTIN Details Scraper & Verifier (Firefox Edition)",
264
+ description="Enter a valid 15-character Indian GSTIN to fetch its registration details and attempt to geocode the principal place of business. Uses Masters India for scraping (with Firefox/GeckoDriver).",
265
+ article="<p style='text-align: center;'>Powered by Selenium, BeautifulSoup, Geopy, and Gradio. <br>Note: Scraping may take 20-45 seconds. Geocoding accuracy may vary.</p>",
266
+ examples=[["27AAFCD5562R1Z5"], ["07AAFCM6072R1Z8"]],
267
  allow_flagging="never",
268
+ theme=gr.themes.Soft()
269
  )
270
 
271
  if __name__ == '__main__':
272
+ if os.environ.get("SYSTEM") == "spaces":
 
 
 
 
273
  iface.launch(debug=False)
274
  else:
275
+ iface.launch(debug=True, share=False)
276
+
277
+ # webdriver-manager # Useful for local testing with Firefox too
278
+ # # app.py
279
+ # import gradio as gr
280
+ # from bs4 import BeautifulSoup
281
+ # from selenium import webdriver
282
+ # from selenium.webdriver.common.by import By
283
+ # from selenium.webdriver.support.ui import WebDriverWait
284
+ # from selenium.webdriver.support import expected_conditions as EC
285
+ # from selenium.webdriver.chrome.service import Service as ChromeService
286
+ # from selenium.webdriver.chrome.options import Options as ChromeOptions
287
+ # from geopy.geocoders import Nominatim, ArcGIS
288
+ # from geopy.exc import GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError
289
+ # import time
290
+ # import pandas as pd
291
+ # import re
292
+ # import os
293
+ # import shutil # For finding chromedriver
294
+
295
+ # def driversetup_huggingface():
296
+ # """Custom driver setup for Hugging Face Spaces (headless)."""
297
+ # options = ChromeOptions()
298
+ # options.add_argument("--headless")
299
+ # options.add_argument("--no-sandbox")
300
+ # # options.add_argument("--disable-gpu")
301
+ # # options.add_argument("--window-size=1920,1080")
302
+ # options.add_argument("--disable-dev-shm-usage")
303
+ # # options.add_argument("lang=en")
304
+ # # options.add_argument("start-maximized")
305
+ # # options.add_argument("disable-infobars")
306
+ # # options.add_argument("--disable-extensions")
307
+ # # options.add_argument("--disable-blink-features=AutomationControlled")
308
+ # options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")
309
+
310
+ # # Attempt to find chromedriver - Hugging Face Spaces might have it in specific locations
311
+ # # or it might need to be installed via packages.txt or a Dockerfile.
312
+ # # For Gradio apps on Spaces, it's often pre-configured or easily installable.
313
+ # # Let's try common paths or rely on it being in PATH.
314
+
315
+ # # Check if chromedriver is in PATH or use a common location
316
+ # chromedriver_path = shutil.which("chromedriver")
317
+ # if chromedriver_path:
318
+ # print(f"Using chromedriver found at: {chromedriver_path}")
319
+ # service = ChromeService(executable_path=chromedriver_path)
320
+ # else:
321
+ # # Fallback if not in PATH - this might fail on HF if not installed correctly
322
+ # print("Chromedriver not found in PATH. Attempting to use 'chromedriver' directly (might fail).")
323
+ # print("For Hugging Face Spaces, ensure Chrome & Chromedriver are available in the environment.")
324
+ # print("You might need to add 'chromium-chromedriver' to a packages.txt file if using a Docker Space.")
325
+ # # As a last resort, try initializing without explicit path, hoping Selenium finds it.
326
+ # # This part is crucial for HF deployment and might need adjustment based on the HF Space environment.
327
+ # # For many Gradio spaces, simply having 'selenium' and 'chromedriver-binary' (or similar)
328
+ # # in requirements.txt might work if the base image is well-configured.
329
+ # # However, for full Chrome, system-level install is better.
330
+ # # For now, we'll proceed assuming it might be found or will error out gracefully.
331
+ # try:
332
+ # # This assumes chromedriver is globally available or Selenium can find it.
333
+ # # On Hugging Face, if using default Docker runtime, you might need to specify
334
+ # # apt packages like 'chromium-driver' or 'google-chrome-stable' + 'chromedriver'
335
+ # # in a packages.txt file or use a custom Dockerfile.
336
+ # # For simplicity, let's assume it can be found or will fail here.
337
+ # # A common path if installed via apt in a container:
338
+ # if os.path.exists("/usr/bin/chromedriver"):
339
+ # service = ChromeService(executable_path="/usr/bin/chromedriver")
340
+ # elif os.path.exists("/usr/local/bin/chromedriver"):
341
+ # service = ChromeService(executable_path="/usr/local/bin/chromedriver")
342
+ # else:
343
+ # # This will likely fail if chromedriver isn't installed and in PATH
344
+ # # On HF Spaces, you typically ensure this via environment setup (e.g. packages.txt)
345
+ # print("Attempting to initialize ChromeService without explicit path...")
346
+ # service = ChromeService() # May fail if chromedriver not in PATH
347
+ # except Exception as e:
348
+ # print(f"Could not initialize ChromeService: {e}. Ensure chromedriver is installed and in PATH.")
349
+ # return None
350
+
351
+
352
+ # try:
353
+ # print("Setting up ChromeDriver for Hugging Face environment...")
354
+ # driver = webdriver.Chrome(service=service, options=options)
355
+ # print("ChromeDriver setup successful.")
356
+ # except Exception as e:
357
+ # print(f"Error setting up ChromeDriver: {e}")
358
+ # return None
359
+
360
+ # driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
361
+ # return driver
362
+
363
+ # def clean_address(address_str):
364
+ # if not isinstance(address_str, str):
365
+ # return ""
366
+ # cleaned_address = ' '.join(address_str.split())
367
+ # cleaned_address = re.sub(r'floor-\s*[\w\s]+,?', '', cleaned_address, flags=re.IGNORECASE)
368
+ # cleaned_address = cleaned_address.replace(' ,', ',').replace(',,', ',')
369
+ # cleaned_address = ', '.join(filter(None, (s.strip() for s in cleaned_address.split(','))))
370
+ # if "india" not in cleaned_address.lower() and ("mumbai" in cleaned_address.lower() or "maharashtra" in cleaned_address.lower()):
371
+ # cleaned_address += ", India"
372
+ # return cleaned_address
373
+
374
+ # def geocode_address_with_fallbacks(address_str, attempt_count=0):
375
+ # if not address_str or not address_str.strip():
376
+ # print("Address string is empty, cannot geocode.")
377
+ # return None, None
378
+ # cleaned_address = clean_address(address_str)
379
+ # print(f"Attempting to geocode cleaned address: '{cleaned_address}' (Attempt {attempt_count + 1})")
380
+ # nominatim_user_agent = f"gstin_gradio_app_hf_{int(time.time())}"
381
+ # geocoders_to_try = [
382
+ # ("Nominatim", Nominatim(user_agent=nominatim_user_agent)),
383
+ # ("ArcGIS", ArcGIS(timeout=10))
384
+ # ]
385
+ # for name, geolocator in geocoders_to_try:
386
+ # try:
387
+ # print(f"Trying geocoder: {name}...")
388
+ # location = geolocator.geocode(cleaned_address, timeout=15)
389
+ # if location:
390
+ # print(f"Success with {name}: Lat: {location.latitude}, Lon: {location.longitude}")
391
+ # return location.latitude, location.longitude
392
+ # else:
393
+ # print(f"{name} could not geocode the address.")
394
+ # except (GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError) as e:
395
+ # print(f"{name} geocoding error: {e}")
396
+ # except Exception as e:
397
+ # print(f"An unexpected error occurred with {name}: {e}")
398
+ # time.sleep(1)
399
+ # if attempt_count == 0:
400
+ # parts = [s.strip() for s in cleaned_address.split(',') if s.strip()]
401
+ # if len(parts) > 3:
402
+ # generic_address = ', '.join(parts[1:])
403
+ # print(f"Trying a more generic address (v1): '{generic_address}'")
404
+ # lat, lon = geocode_address_with_fallbacks(generic_address, attempt_count + 1)
405
+ # if lat is not None: return lat, lon
406
+ # if len(parts) > 4:
407
+ # generic_address_v2 = ', '.join(parts[2:])
408
+ # print(f"Trying a more generic address (v2): '{generic_address_v2}'")
409
+ # return geocode_address_with_fallbacks(generic_address_v2, attempt_count + 1)
410
+ # print("All geocoding attempts failed for the address.")
411
+ # return None, None
412
+
413
+ # def get_gstin_details_for_gradio(gstin_number_input):
414
+ # """
415
+ # Main function for Gradio: takes GSTIN, scrapes, and returns data as DataFrame.
416
+ # """
417
+ # gstin_number = str(gstin_number_input).strip().upper()
418
+ # if not (len(gstin_number) == 15 and gstin_number.isalnum()):
419
+ # return pd.DataFrame({"Error": ["Invalid GSTIN format. Must be 15 alphanumeric characters."]})
420
+
421
+ # print(f"Initiating scraper for GSTIN: {gstin_number}")
422
+ # driver = driversetup_huggingface()
423
+
424
+ # if driver is None:
425
+ # print("WebDriver not initialized for scraper.")
426
+ # return pd.DataFrame({"Error": ["WebDriver initialization failed. Check server logs."]})
427
+
428
+ # extracted_data = {"GSTIN Queried": gstin_number}
429
+ # wait_time = 30
430
+ # url = "https://www.mastersindia.co/gst-number-search-and-gstin-verification/"
431
+
432
+ # try:
433
+ # driver.get(url)
434
+ # print(f"Navigated to URL: {url}")
435
+
436
+ # gstin_input_css_selector = 'input[placeholder="XXXAAAYYYYZ01Z5"]'
437
+ # WebDriverWait(driver, wait_time).until(
438
+ # EC.presence_of_element_located((By.CSS_SELECTOR, gstin_input_css_selector))
439
+ # )
440
+ # gstin_input = driver.find_element(By.CSS_SELECTOR, gstin_input_css_selector)
441
+ # gstin_input.clear()
442
+ # gstin_input.send_keys(gstin_number)
443
+ # print(f"Entered GSTIN: {gstin_number}")
444
+
445
+ # search_button_css_selector = 'button[aria-label="Search"]'
446
+ # WebDriverWait(driver, wait_time).until(
447
+ # EC.element_to_be_clickable((By.CSS_SELECTOR, search_button_css_selector))
448
+ # )
449
+ # search_button = driver.find_element(By.CSS_SELECTOR, search_button_css_selector)
450
+ # driver.execute_script("arguments[0].click();", search_button)
451
+ # print("Clicked Search button.")
452
+
453
+ # results_table_container_css_selector_for_wait = "div.eaKoeQ table"
454
+ # WebDriverWait(driver, wait_time).until(
455
+ # EC.presence_of_element_located((By.CSS_SELECTOR, results_table_container_css_selector_for_wait))
456
+ # )
457
+ # print("Results table container found.")
458
+ # time.sleep(4)
459
+
460
+ # page_source = driver.page_source
461
+ # soup = BeautifulSoup(page_source, 'html.parser')
462
+
463
+ # table_container_div = soup.select_one("div.eaKoeQ")
464
+ # table = None
465
+ # if table_container_div: table = table_container_div.find('table')
466
+ # if not table: table = soup.find('table')
467
+
468
+ # if not table:
469
+ # msg = "No data table found on the page after search."
470
+ # if "captcha" in page_source.lower(): msg = "CAPTCHA detected during scraping."
471
+ # elif "No details found" in page_source or "Invalid GSTIN" in page_source:
472
+ # msg = f"No details found for GSTIN {gstin_number} or invalid GSTIN."
473
+ # print(msg)
474
+ # return pd.DataFrame({"Error": [msg]})
475
+
476
+ # rows = table.find_all('tr')
477
+ # raw_data = {}
478
+ # for row in rows:
479
+ # header_element = row.find('th', class_=lambda x: x and 'eLVLDP' in x.split())
480
+ # value_element = row.find('td', class_=lambda x: x and 'jdgLDg' in x.split())
481
+ # if header_element and value_element:
482
+ # raw_data[header_element.get_text(strip=True)] = value_element.get_text(strip=True)
483
+ # elif len(row.find_all('td')) == 2:
484
+ # cells = row.find_all('td')
485
+ # if cells[0].get_text(strip=True):
486
+ # raw_data[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True)
487
+
488
+ # if not raw_data:
489
+ # print("Could not parse any data from the table rows.")
490
+ # return pd.DataFrame({"Error": ["Failed to parse data from table."]})
491
+
492
+ # fields_to_extract_map = {
493
+ # "Principal Place of Business": "Principal Business Address",
494
+ # "Additional Place of Business": "Additional Business Address(es)",
495
+ # "State Jurisdiction": "State Jurisdiction",
496
+ # "Centre Jurisdiction": "Centre Jurisdiction",
497
+ # "Date of Registration": "Registration Date",
498
+ # "Constitution of Business": "Business Constitution",
499
+ # "Taxpayer Type": "Taxpayer Type",
500
+ # "GSTIN Status": "GSTIN Status"
501
+ # }
502
+ # for web_key, display_key in fields_to_extract_map.items():
503
+ # extracted_data[display_key] = raw_data.get(web_key, "Not Found")
504
+
505
+ # address_to_geocode = extracted_data.get("Principal Business Address")
506
+ # if address_to_geocode not in [None, "Not Found", ""]:
507
+ # lat, lon = geocode_address_with_fallbacks(address_to_geocode)
508
+ # extracted_data["Address Latitude"] = lat if lat is not None else "N/A"
509
+ # extracted_data["Address Longitude"] = lon if lon is not None else "N/A"
510
+ # else:
511
+ # extracted_data["Address Latitude"] = "N/A"
512
+ # extracted_data["Address Longitude"] = "N/A"
513
+ # if extracted_data.get("Principal Business Address"):
514
+ # print("Principal Place of Business not found or empty, skipping geocoding.")
515
+
516
+ # print(f"Successfully scraped data for {gstin_number}")
517
+ # # Convert dictionary to a 2-column DataFrame for Gradio
518
+ # df_output = pd.DataFrame(list(extracted_data.items()), columns=["Field", "Value"])
519
+ # return df_output
520
+
521
+ # except Exception as e:
522
+ # print(f"An error occurred during scraping process for {gstin_number}: {e}")
523
+ # # import traceback
524
+ # # traceback.print_exc()
525
+ # return pd.DataFrame({"Error": [f"Scraping process failed: {str(e)}"]})
526
+ # finally:
527
+ # if 'driver' in locals() and driver is not None:
528
+ # try:
529
+ # driver.quit()
530
+ # print("Browser closed.")
531
+ # except Exception as e:
532
+ # print(f"Error quitting driver: {e}")
533
+
534
+ # # --- Gradio Interface ---
535
+ # iface = gr.Interface(
536
+ # fn=get_gstin_details_for_gradio,
537
+ # inputs=gr.Textbox(
538
+ # label="Enter GSTIN",
539
+ # placeholder="Enter 15-character GSTIN (e.g., 27AAFCD5562R1Z5)",
540
+ # max_lines=1,
541
+ # info="The scraper will fetch details for the provided GSTIN from Masters India."
542
+ # ),
543
+ # outputs=gr.DataFrame(
544
+ # label="GSTIN Details",
545
+ # headers=["Field", "Value"],
546
+ # wrap=True
547
+ # ),
548
+ # title="🧾 GSTIN Details Scraper & Verifier",
549
+ # description="Enter a valid 15-character Indian GSTIN to fetch its registration details and attempt to geocode the principal place of business. Uses Masters India for scraping.",
550
+ # article="<p style='text-align: center;'>Powered by Selenium, BeautifulSoup, Geopy, and Gradio. <br>Note: Scraping may take 20-40 seconds. Geocoding accuracy may vary.</p>",
551
+ # examples=[["27AAFCD5562R1Z5"], ["07AAFCM6072R1Z8"]], # Example GSTINs
552
+ # allow_flagging="never",
553
+ # theme=gr.themes.Soft() # Using a soft theme
554
+ # )
555
+
556
+ # if __name__ == '__main__':
557
+ # # For Hugging Face Spaces, Gradio typically handles the server.
558
+ # # `launch(share=True)` is mainly for local testing, when you want a temporary public link.
559
+ # # On HF Spaces, just `iface.launch()` is enough.
560
+ # # To run locally: python app.py
561
+ # if os.environ.get("SYSTEM") == "spaces": # Check if running in Hugging Face Spaces
562
+ # iface.launch(debug=False)
563
+ # else:
564
+ # iface.launch(debug=True, share=True)