import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import linalg
import plotly.graph_objects as go
from collections import Counter
import warnings
import transformers
import gradio as gr
import streamlit as st

warnings.filterwarnings("ignore")

# Set up logging
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
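# The NLP steps below rely on NLTK corpora (stopwords, WordNet, punkt).
# If they are not already installed, the one-time downloads below are needed
# (left commented out so the app does not hit the network on every start):
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')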
# Function to fetch HTML content from GitHub issue pages
def fetch_issue_data(username, repository, start_page, end_page):
    issues_data = []
    for page in range(start_page, end_page + 1):
        url = f"https://github.com/{username}/{repository}/issues?page={page}"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        issue_elements = soup.find_all('div', class_='flex-shrink-0')
        for issue_element in issue_elements:
            issue_link = issue_element.find('a', class_='Link--primary')['href']
            issue_url = f"https://github.com{issue_link}"
            issue_data = fetch_issue_details(issue_url)
            issues_data.append(issue_data)
    return issues_data
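# GitHub may throttle rapid unauthenticated requests; a short pause between
# page fetches (e.g. time.sleep(1) after `import time`) is a reasonable
# precaution, though it is not part of the original loop above.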
# Function to fetch details of a specific issue
def fetch_issue_details(issue_url):
    response = requests.get(issue_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    issue_title = soup.find('h1', class_='gh-header-title').text.strip()
    issue_body = soup.find('div', class_='markdown-body').text.strip()
    issue_created_at = soup.find('relative-time')['datetime']
    issue_closed_at = soup.find('relative-time', class_='no-wrap')
    if issue_closed_at:
        issue_closed_at = issue_closed_at['datetime']
    else:
        issue_closed_at = None
    issue_author = soup.find('a', class_='author').text.strip()
    issue_assignee = soup.find('a', class_='Link--muted')
    if issue_assignee:
        issue_assignee = issue_assignee.text.strip()
    else:
        issue_assignee = None
    return {
        'title': issue_title,
        'body': issue_body,
        'created_at': issue_created_at,
        'closed_at': issue_closed_at,
        'author': issue_author,
        'assignee': issue_assignee
    }
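# Note: the two scrapers above depend on GitHub's current HTML class names
# (e.g. 'gh-header-title', 'markdown-body') and may break when the markup
# changes. Below is a minimal alternative sketch using GitHub's REST API;
# it is not wired into the rest of this app and is shown only for reference.
def fetch_issue_data_api(username, repository, start_page, end_page):
    issues_data = []
    for page in range(start_page, end_page + 1):
        url = f"https://api.github.com/repos/{username}/{repository}/issues"
        response = requests.get(url, params={'state': 'all', 'page': page, 'per_page': 30})
        for issue in response.json():
            issues_data.append({
                'title': issue.get('title'),
                'body': issue.get('body') or '',
                'created_at': issue.get('created_at'),
                'closed_at': issue.get('closed_at'),
                'author': (issue.get('user') or {}).get('login'),
                'assignee': (issue.get('assignee') or {}).get('login'),
            })
    return issues_data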
# Function to clean and structure the data
def clean_and_structure_data(issues_data):
    df = pd.DataFrame(issues_data)
    if 'created_at' in df.columns:
        df['created_at'] = pd.to_datetime(df['created_at'])
    else:
        logging.error("The 'created_at' column is missing from the dataframe.")
        df['created_at'] = pd.NaT
    if 'closed_at' in df.columns:
        df['closed_at'] = pd.to_datetime(df['closed_at'])
    else:
        df['closed_at'] = pd.NaT
    # Resolution time in days; still-open issues get -1 as a sentinel value
    df['resolution_time'] = (df['closed_at'] - df['created_at']).dt.days
    df['resolution_time'] = df['resolution_time'].fillna(-1)
    df['is_closed'] = df['closed_at'].notna().astype(int)
    return df
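# Example of how the collection and cleaning steps chain together
# (the repository name is only illustrative; running this triggers live
# requests, so it is left commented out):
# issues = fetch_issue_data("huggingface", "transformers", start_page=1, end_page=2)
# df = clean_and_structure_data(issues)
# df[['title', 'author', 'resolution_time', 'is_closed']].head()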
# Function for exploratory data analysis (EDA)
def perform_eda(df):
    # Descriptive statistics
    st.write(df.describe())
    # Visualization: distribution of resolution times
    fig, ax = plt.subplots()
    sns.histplot(df['resolution_time'], kde=True, ax=ax)
    st.pyplot(fig)
    # Visualization: resolution time by month of creation
    fig, ax = plt.subplots()
    sns.lineplot(x=df['created_at'].dt.month, y=df['resolution_time'], ax=ax)
    st.pyplot(fig)
    top_authors = df['author'].value_counts().nlargest(10)
    st.write("Top 10 Authors:")
    st.write(top_authors)
    top_assignees = df['assignee'].value_counts().nlargest(10)
    st.write("Top 10 Assignees:")
    st.write(top_assignees)
# Function for text analysis using NLP
def analyze_text_content(df):
    # Text preprocessing: remove stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    df['processed_body'] = df['body'].apply(
        lambda text: ' '.join(
            lemmatizer.lemmatize(word) for word in word_tokenize(text) if word.lower() not in stop_words
        )
    )
    # Topic modeling: tokenize once and reuse the tokens for the dictionary and corpus
    tokenized_docs = [word_tokenize(text) for text in df['processed_body']]
    dictionary = Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
    lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary)
    st.write("Top 5 Topics:")
    for topic in lda_model.print_topics(num_words=5):
        st.write(topic)
    # Sentiment analysis with VADER compound scores
    analyzer = SentimentIntensityAnalyzer()
    df['sentiment'] = df['body'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
    st.write("Sentiment Analysis:")
    st.write(df['sentiment'].describe())
    # Word cloud of common words: draw on a figure first, then hand it to Streamlit
    from wordcloud import WordCloud
    all_words = ' '.join(df['processed_body'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
    fig = plt.figure(figsize=(10, 6), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    st.pyplot(fig)
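# TextBlob is imported at the top but unused; a minimal sketch of how it could
# supply a second polarity score alongside VADER (an assumption, not part of
# the original analysis):
# df['textblob_polarity'] = df['body'].apply(lambda text: TextBlob(text).sentiment.polarity)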
# Function to create a network graph of issues, authors, and assignees
def create_network_graph(df):
    graph = nx.Graph()
    for index, row in df.iterrows():
        graph.add_node(row['title'], type='issue')
        graph.add_node(row['author'], type='author')
        if pd.notna(row['assignee']):
            graph.add_node(row['assignee'], type='assignee')
        graph.add_edge(row['title'], row['author'])
        if pd.notna(row['assignee']):
            graph.add_edge(row['title'], row['assignee'])
    ...
    # Interactive Network Graph with Plotly
    pos = nx.spring_layout(graph, k=0.5)
    # Plotly draws all edges as one line trace; None separates the segments
    edge_x = []
    edge_y = []
    for edge in graph.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )
    node_x = []
    node_y = []
    for node in graph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers',
        marker=dict(
            color=[],
            size=10,
            line=dict(width=2, color='black')
        ),
        text=[],
        hoverinfo='text'
    )
    # Set node colors based on type
    node_colors = []
    for node in graph.nodes():
        if graph.nodes[node]['type'] == 'issue':
            node_colors.append('red')
        elif graph.nodes[node]['type'] == 'author':
            node_colors.append('blue')