Canstralian committed on
Commit
f13ce67
·
verified ·
1 Parent(s): 61d716e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -92
app.py CHANGED
@@ -1,11 +1,8 @@
1
  import streamlit as st
2
  import requests
3
- import re
4
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
5
- import torch
6
- import pandas as pd
7
- from datasets import Dataset
8
- from huggingface_hub import hf_api
9
 
10
  # Title and description
11
  st.title("OSINT Tool 🏢")
@@ -13,7 +10,7 @@ st.markdown("""
13
  This tool performs **Open Source Intelligence (OSINT)** analysis on GitHub repositories and fetches titles from URLs.
14
  It also allows uploading datasets (CSV format) for fine-tuning models like **DistilBERT**.
15
  """)
16
-
17
  # Sidebar for navigation
18
  st.sidebar.title("Navigation")
19
  app_mode = st.sidebar.radio("Choose the mode", ["GitHub Repository Analysis", "URL Title Fetcher", "Dataset Upload & Fine-Tuning"])
@@ -26,22 +23,13 @@ if app_mode == "GitHub Repository Analysis":
26
 
27
  if st.button("Analyze Repository"):
28
  if repo_owner and repo_name:
29
- try:
30
- response = requests.get(f"https://api.github.com/repos/{repo_owner}/{repo_name}")
31
- data = response.json()
32
-
33
- if response.status_code == 200:
34
- st.subheader("Repository Details")
35
- st.write(f"**Name**: {data['name']}")
36
- st.write(f"**Owner**: {data['owner']['login']}")
37
- st.write(f"**Stars**: {data['stargazers_count']}")
38
- st.write(f"**Forks**: {data['forks_count']}")
39
- st.write(f"**Language**: {data['language']}")
40
- st.write(f"**Description**: {data['description']}")
41
- else:
42
- st.error(f"Error: {data.get('message', 'Something went wrong with the request')}")
43
- except Exception as e:
44
- st.error(f"Error occurred: {e}")
45
  else:
46
  st.warning("Please enter both repository owner and name.")
47
 
@@ -52,20 +40,11 @@ elif app_mode == "URL Title Fetcher":
52
 
53
  if st.button("Fetch Title"):
54
  if url:
55
- try:
56
- response = requests.get(url)
57
- if response.status_code == 200:
58
- # Try to extract the title from the HTML
59
- match = re.search('<title>(.*?)</title>', response.text)
60
- if match:
61
- title = match.group(1)
62
- st.write(f"**Page Title**: {title}")
63
- else:
64
- st.warning("Title tag not found in the page")
65
- else:
66
- st.error(f"Failed to retrieve the page. Status code: {response.status_code}")
67
- except Exception as e:
68
- st.error(f"Error occurred: {e}")
69
  else:
70
  st.warning("Please enter a valid URL.")
71
 
@@ -76,58 +55,4 @@ elif app_mode == "Dataset Upload & Fine-Tuning":
76
  uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")
77
 
78
  if uploaded_file is not None:
79
- # Load the CSV into a pandas DataFrame
80
- df = pd.read_csv(uploaded_file)
81
-
82
- # Display dataset preview
83
- st.subheader("Dataset Preview")
84
- st.write(df.head())
85
-
86
- # Convert CSV to Hugging Face dataset format
87
- dataset = Dataset.from_pandas(df)
88
-
89
- model_name = st.selectbox("Select model for fine-tuning", ["distilbert-base-uncased"])
90
-
91
- if st.button("Fine-tune Model"):
92
- if model_name:
93
- try:
94
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
95
- tokenizer = AutoTokenizer.from_pretrained(model_name)
96
-
97
- # Prepare the dataset
98
- def preprocess_function(examples):
99
- return tokenizer(examples['text'], truncation=True, padding=True)
100
-
101
- tokenized_datasets = dataset.map(preprocess_function, batched=True)
102
-
103
- # Fine-tuning setup (using Hugging Face Trainer for a complete setup)
104
- from transformers import Trainer, TrainingArguments
105
-
106
- training_args = TrainingArguments(
107
- output_dir="./results",
108
- evaluation_strategy="epoch",
109
- learning_rate=2e-5,
110
- per_device_train_batch_size=16,
111
- per_device_eval_batch_size=16,
112
- num_train_epochs=3,
113
- weight_decay=0.01,
114
- )
115
-
116
- trainer = Trainer(
117
- model=model,
118
- args=training_args,
119
- train_dataset=tokenized_datasets,
120
- eval_dataset=tokenized_datasets,
121
- )
122
-
123
- # Train the model
124
- trainer.train()
125
-
126
- st.success("Fine-tuning completed successfully!")
127
- except Exception as e:
128
- st.error(f"Error during fine-tuning: {e}")
129
- else:
130
- st.warning("Please select a model for fine-tuning.")
131
-
132
- else:
133
- st.warning("Please upload a dataset.")
 
1
  import streamlit as st
2
  import requests
3
+ from src.github_analysis import analyze_github_repo
4
+ from src.url_fetcher import fetch_url_title
5
+ from src.fine_tune_helpers import fine_tune_model
 
 
 
6
 
7
  # Title and description
8
  st.title("OSINT Tool 🏢")
 
10
  This tool performs **Open Source Intelligence (OSINT)** analysis on GitHub repositories and fetches titles from URLs.
11
  It also allows uploading datasets (CSV format) for fine-tuning models like **DistilBERT**.
12
  """)
13
+
14
  # Sidebar for navigation
15
  st.sidebar.title("Navigation")
16
  app_mode = st.sidebar.radio("Choose the mode", ["GitHub Repository Analysis", "URL Title Fetcher", "Dataset Upload & Fine-Tuning"])
 
23
 
24
  if st.button("Analyze Repository"):
25
  if repo_owner and repo_name:
26
+ repo_data = analyze_github_repo(repo_owner, repo_name)
27
+ if repo_data:
28
+ st.subheader("Repository Details")
29
+ for key, value in repo_data.items():
30
+ st.write(f"**{key}**: {value}")
31
+ else:
32
+ st.error("Failed to retrieve repository details.")
 
 
 
 
 
 
 
 
 
33
  else:
34
  st.warning("Please enter both repository owner and name.")
35
 
 
40
 
41
  if st.button("Fetch Title"):
42
  if url:
43
+ title = fetch_url_title(url)
44
+ if title:
45
+ st.write(f"**Page Title**: {title}")
46
+ else:
47
+ st.error("Failed to retrieve the page title.")
 
 
 
 
 
 
 
 
 
48
  else:
49
  st.warning("Please enter a valid URL.")
50
 
 
55
  uploaded_file = st.file_uploader("Upload a CSV file for fine-tuning", type="csv")
56
 
57
  if uploaded_file is not None:
58
+ fine_tune_model(uploaded_file)