subpixel commited on
Commit
d5761e7
·
1 Parent(s): 6679c3b

Add project

Browse files
Files changed (6) hide show
  1. .gitignore +9 -0
  2. LLM.py +75 -0
  3. README.md +125 -13
  4. app.py +104 -0
  5. email_utils.py +140 -0
  6. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ /myenv
2
+ __pycache__
3
+ cache
4
+ venv
5
+ .env
6
+ .vscode
7
+ llm.log
8
+ .upm
9
+ .pythonlibs
LLM.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import timeit
3
+ import json
4
+ import os
5
+ import torch
6
+ import streamlit as st
7
+ # This should stay above the import of transformers to have model downloaded in the same directory as the project
8
+ os.environ['TRANSFORMERS_CACHE'] = os.curdir + '/cache'
9
+ from transformers import pipeline
10
+
11
+
12
# Send INFO-and-above records to llm.log, appending to any existing file.
logging.basicConfig(
    filename='llm.log',
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
17
+
18
+
19
@st.cache_resource
def init():
    """Build the three Hugging Face pipelines used by the app.

    Decorated with ``st.cache_resource`` so Streamlit can reuse the loaded
    pipelines rather than constructing them again on each call.

    Returns:
        A list ``[summarizer, detector, tagger]``: the summarization,
        spam text-classification and tag-generation pipelines, in that order.
    """
    # Only the summarizer is given an explicit device: CUDA device 0 when
    # available, otherwise -1 (CPU).
    summarizer_device = 0 if torch.cuda.is_available() else -1

    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        use_fast=True,
        device=summarizer_device,
    )
    detector = pipeline(
        "text-classification",
        model="1aurent/distilbert-base-multilingual-cased-finetuned-email-spam",
        use_fast=True,
    )
    tagger = pipeline(
        "text2text-generation",
        model="fabiochiu/t5-base-tag-generation",
        use_fast=True,
    )
    return [summarizer, detector, tagger]
34
+
35
+
36
def summarize(prompt, summarizer):
    """Summarize ``prompt`` and log both the result and the elapsed time.

    Args:
        prompt: Text to summarize; only its first 2048 characters are used.
        summarizer: Callable invoked as ``summarizer(text, truncation=True)``.

    Returns:
        Whatever ``summarizer`` returns, unchanged.
    """
    began = timeit.default_timer()
    result = summarizer(prompt[:2048], truncation=True)
    finished = timeit.default_timer()

    logging.info(f"Summary: {result}")
    logging.info(f"Time taken to summarize: {finished - began}")
    return result
44
+
45
+
46
def detect_spam(prompt, detector):
    """Classify ``prompt`` and return the label of the first result.

    Args:
        prompt: Text to classify; only its first 2048 characters are used.
        detector: Callable invoked as ``detector(text, truncation=True)``
            that returns a sequence of dicts with a ``'label'`` key.

    Returns:
        The ``'label'`` value of the first element returned by ``detector``.
    """
    predictions = detector(prompt[:2048], truncation=True)
    top_prediction = predictions[0]
    return top_prediction['label']
49
+
50
+
51
def get_tags(prompt, tagger):
    """Generate tags for ``prompt``.

    Args:
        prompt: Text to tag; only its first 2048 characters are used.
        tagger: Callable invoked as ``tagger(text, truncation=True)``.

    Returns:
        Whatever ``tagger`` returns, unchanged.
    """
    return tagger(prompt[:2048], truncation=True)
54
+
55
+
56
+ # if __name__ == "__main__":
57
+ # llm = Summarizer()
58
+
59
+ # summary = llm.summarize("""
60
+ # image.png
61
+
62
+
63
+ # Job Chahiye!?!?
64
+
65
+ # GDSC is here with another fantastic event
66
+ # DSA Busted
67
+ # This event will teach you about DATA STRUCTURES AND ALGORITHMS, as well as how to tackle coding rounds.
68
+ # Every Saturday, we will have live doubt sessions.
69
+ # Every Sunday, we will have a quiz.
70
+ # CERTIFICATE and Exciting GOODIES from GOOGLE.
71
+
72
+ # So, don't pass up this excellent opportunity to begin or fast track your placement preparations.
73
+
74
+ # """)
75
+ # print(summary)
README.md CHANGED
@@ -1,13 +1,125 @@
1
- ---
2
- title: Mailsensei
3
- emoji: 📊
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.28.2
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚀 Introducing MailSensei
2
+ ## Your Cutting-Edge Email Management Companion 📧✂️
3
+
4
+ <!-- <img src="https://i.ibb.co/Bg6h8qp/your-image.png" alt="MailSensei Image"> -->
5
+ <!-- ![Image](./PeerlistHackathon_CoverImg.png) -->
6
+
7
+
8
+ ![Cover Image](https://raw.githubusercontent.com/Deven1902/Peerlist-Hackathon/main/PeerlistHackathon_CoverImg.png)
9
+
10
+ Tired of sifting through endless emails, only to discover they're not worth your time? Ever missed important messages due to email overload? In today's digital age, our inboxes are overflowing with both vital and distracting content. You might spend precious time on lengthy, irrelevant emails. MailSensei is here to revolutionize your email experience by addressing these challenges. 🚀
11
+
12
+ ### Need for a Solution 🤔
13
+
14
+ ![Email Overload](https://media.giphy.com/media/OothRHNJSCaTS/giphy.gif)
15
+
16
+ CEOs and business professionals often receive hundreds of emails daily, making efficient email management essential. MailSensei offers a solution to navigate this email overload, ensuring that crucial messages are never missed and time is spent more productively. 🕒💼
17
+
18
+ ### How to run the project 🌟
19
+ - Visit the following URL to locate the project:- **[https://replit.com/@DevenNandapurka/Peerlist-Hackathon](https://replit.com/@DevenNandapurka/Peerlist-Hackathon)**
20
+
21
+ - You will see the replit workspace. Click the `Run` button.
22
+
23
+ - You need to set up an **App Password** before you can set credentials for MailSensei.
24
+ Here we have attached an article to guide you through setting up an app password -> **[Generate your App Password](https://support.google.com/mail/answer/185833?hl=en)**
25
+
26
+ - Once the project is up and running, enter your credentials in the project and click the `Set Credentials` button.
27
+
28
+ - Click the `Fetch Emails` button -> and all of your unread emails will be fetched and rendered in a short summary format.
29
+
30
+ ### Our Unique Approach 🌟
31
+
32
+ - **Gmail Credentials Setup:** Securely collect your "Gmail Address" and "App Password" via Streamlit to maintain the confidentiality of your email data. 🔒
33
+
34
+ - **IMAP Connection:** Establish a connection to your Gmail account using IMAP (Internet Message Access Protocol) for seamless email access. 🌐
35
+
36
+ - **Email Decoding:** Decode your emails to extract essential information, including "From," "Subject," and email content. 📤
37
+
38
+ - **Email Summarization:** Utilize Large Language Models (LLMs), specifically "distilbart-cnn-12-6," to generate concise summaries of your emails, saving you valuable time. 📝⏳
39
+
40
+ - **Tag Generation:** Intelligently categorize and organize emails using "t5-base-tag-generation." Tags are displayed alongside email details. 🏷️
41
+
42
+ - **Streamlined Presentation:** Display email information in an ordered format, including "From," "Subject," and associated tags for easy reference. 📊
43
+
44
+ - **Dropdown Summaries:** Provide a dropdown button for each email, allowing users to access the summary with a single click, enhancing readability. 📑
45
+
46
+ - **Original Email Link:** Include a convenient link at the end of each summary that directs users to the original email for further context and action. 🔗
47
+
48
+ ### Technology Stack 🛠️
49
+
50
+ - **Programming Languages:** Python
51
+ - **Web Framework:** Streamlit
52
+ - **Machine Learning Framework:** PyTorch
53
+ - **Email Access:** IMAP (Internet Message Access Protocol)
54
+ - **Text Processing:** Transformers library (from Hugging Face)
55
+ - **Logging:** Python Logging Library
56
+
57
+ ### Models Used 🧠
58
+
59
+ 1. **Large Language Model (LLM) for Summarization:**
60
+ - **Model:** `sshleifer/distilbart-cnn-12-6` (DistilBART)
61
+ - **Purpose:** Summarizes email content to generate concise summaries.
62
+ - **Utilized for:** Email summarization.
63
+
64
+ 2. **Spam Detection Model (currently commented out as results are not accurate):**
65
+ - **Model:** `1aurent/distilbert-base-multilingual-cased-finetuned-email-spam` (DistilBERT)
66
+ - **Purpose:** Detects spam emails within the inbox.
67
+ - **Utilized for:** Spam filtering.
68
+
69
+ 3. **Tag Generation Model:**
70
+ - **Model:** `fabiochiu/t5-base-tag-generation` (T5)
71
+ - **Purpose:** Generates tags to categorize and organize emails.
72
+ - **Utilized for:** Email tagging and categorization.
73
+
74
+
75
+ ### Running locally:
76
+ Requirements:
77
+ - Python 3.7+
78
+ - cuda (optional for GPU support)
79
+ - Minimum 4GB RAM is required to run the app
80
+
81
+ clone the repository:
82
+ ```bash
83
+ git clone https://github.com/Deven1902/Peerlist-Hackathon
84
+ ```
85
+ optionally create a virtual environment:
86
+ ```bash
87
+ python -m venv venv
88
+ ```
89
+ activate the virtual environment:
90
+
91
+ for Linux
92
+ ```bash
93
+ source venv/bin/activate
94
+ ```
95
+
96
+ for Windows
97
+ ```bash
98
+ venv\Scripts\activate
99
+ ```
100
+
101
+ install requirements using pip:
102
+ ```bash
103
+ pip install -r requirements.txt
104
+ ```
105
+
106
+ run the app:
107
+ ```bash
108
+ streamlit run app.py
109
+ ```
110
+
111
+ ### Future Enhancements 🚀🔮
112
+
113
+ - **Improved Spam Filtering:** We'll enhance the accuracy of our spam detection algorithm for a cleaner inbox. 🚮
114
+
115
+ - **Tag Collections & Sharing:** Users can create and share tag collections, streamlining collaboration and productivity. 🚀📊
116
+
117
+ - **Custom Tag Generation:** Customize your own tag generation models for tailored email organization. 🏷️
118
+
119
+ - **User Preferences:** Fine-tune MailSensei with preferences like summarization length and tag rules for a personalized experience. ⚙️👤
120
+
121
+
122
+ *"Initially, we used OpenAI keys for all our app functions. However, we later changed our approach as some users may not have premium keys, and we wanted to ensure accessibility without additional costs. 😉😊"*
123
+
124
+
125
+
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import email_utils
3
+ import LLM
4
+
5
+
6
# Store the pipelines returned by LLM.init() in session state the first time
# this script runs for a session, then read them back on every run.
if "model" not in st.session_state:
    st.session_state["model"] = LLM.init()
model = st.session_state["model"]

# Credentials count as unset until start() records a successful login.
if "credentials_set" not in st.session_state:
    st.session_state['credentials_set'] = False
12
+
13
+
14
def start():
    """Draw the credential form and the "Fetch Emails" control.

    The sidebar collects an email address and app password. Pressing
    "Set Credentials" validates them with ``email_utils.set_credentials``
    and, on success, stores the unread message IDs in
    ``st.session_state["email_ids"]`` and sets
    ``st.session_state["credentials_set"]``. While credentials are set,
    "Fetch Emails" renders the summaries via ``render_emails``.

    NOTE(review): the nesting below was reconstructed from a diff view that
    had lost its indentation; confirm the "Fetch Emails" section is meant to
    sit outside the sidebar.
    """
    with st.sidebar:
        st.markdown("# Email Summarizer")
        st.subheader("Email Credentials")
        from_email = st.text_input("Email Address")
        from_password = st.text_input("App Password", type="password")
        if st.button("Set Credentials"):
            if not from_email or not from_password:
                st.error("Please provide both email address and password.")
            elif email_utils.set_credentials(from_email, from_password):
                # Fetch the IDs before flagging the credentials as set, so a
                # failed fetch cannot leave "credentials_set" True without
                # any "email_ids" stored.
                email_ids = email_utils.fetch_emails_from_imap(from_email, from_password)
                st.session_state.update(email_ids=email_ids)
                st.session_state['credentials_set'] = True
                st.success(f"Credentials set successfully. Email: {from_email}")
            else:
                st.error("Failed to set credentials")

    if st.session_state['credentials_set']:
        if st.button("Fetch Emails"):
            try:
                with st.spinner('Loading...'):
                    render_emails(from_email, from_password)
            except Exception as e:
                # Keep the console trace, but also tell the user: previously
                # the failure was only printed and the page stayed blank.
                print(e)
                st.error(f"Could not fetch emails: {e}")
    else:
        st.button("Fetch Emails", disabled=True)
        st.warning("Please set credentials first", icon="⚠️",)
42
+
43
+
44
def render_emails(from_email, from_password, page_size=10):
    """Render one page of emails as expandable summaries with paging buttons.

    The page number is read from ``st.session_state["page"]`` (default 1) and
    the message IDs from ``st.session_state["email_ids"]``. Each message on
    the page is decoded, stripped of HTML, summarized and tagged, then shown
    in an expander together with a link that searches Gmail for its
    Message-ID.

    Args:
        from_email: Address used to log in to IMAP.
        from_password: App password used to log in to IMAP.
        page_size: The number of emails to display per page.
    """
    from urllib.parse import quote

    page_number = st.session_state.get("page", 1)
    # Ignore empty IDs: an empty search result can be stored as [''].
    email_ids = [eid for eid in st.session_state.get("email_ids", []) if eid]
    total = len(email_ids)
    if not total:
        st.info("No unread emails found.")
        return

    start_index = (page_number - 1) * page_size
    end_index = start_index + page_size

    email_messages = email_utils.decode_emails(email_ids, start_index, end_index,
                                               from_email, from_password)

    for email_message in email_messages:
        content = email_utils.strip_tags(email_message["content"])
        summary = LLM.summarize(content, model[0])
        tags = LLM.get_tags(content, model[2])
        # Spam detection (LLM.detect_spam with model[1]) is deliberately not run here.

        # The Message-ID comes from the (untrusted) email, so percent-encode
        # it before placing it in the URL and the Markdown link below.
        message_id = str(email_message["Message ID"] or "").strip()
        redirect_url = ("https://mail.google.com/mail/u/0/#search/rfc822msgid%3A"
                        + quote(message_id, safe=""))

        # The subject may be missing entirely; treat that like a blank one.
        subject = str(email_message['subject'] or "")

        EMAIL_FROM = email_message['from']
        EMAIL_SUBJECT = subject if subject.strip() else "No Subject"
        EMAIL_TAGS = tags[0]['generated_text']
        EMAIL_SUMMARY = summary[0]['summary_text']

        with st.expander(
            f"**From**:\n{EMAIL_FROM}\n\n**Subject**:\n{EMAIL_SUBJECT}\n\n**Tags**:\n{EMAIL_TAGS}\n\n"
        ):
            st.markdown(f"**Summary**:\n {EMAIL_SUMMARY}")
            st.markdown(f"**[Read full e-mail]({redirect_url})**")

    # Navigation: each callback stores the new page number and re-renders.
    if page_number > 1:
        st.button('Previous page',
                  on_click=lambda: (st.session_state.update(page=page_number - 1),
                                    render_emails(from_email, from_password, page_size)))
    # Offer "Next page" only while emails remain beyond this page. The old
    # check compared the page number with the number of emails.
    if end_index < total:
        st.button('Next page',
                  on_click=lambda: (st.session_state.update(page=page_number + 1),
                                    render_emails(from_email, from_password, page_size)))
101
+
102
+
103
+ if __name__ == "__main__":
104
+ start()
email_utils.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from html.parser import HTMLParser
2
+ from io import StringIO
3
+ import email
4
+ import imaplib
5
+
6
+
7
def set_credentials(username, password):
    """Check whether the given credentials can log in to Gmail over IMAP.

    Opens an SSL connection to ``imap.gmail.com:993``, attempts a login and
    logs out again. Nothing is stored; the name is historical.

    Args:
        username: The Gmail username.
        password: The Gmail password (app password).

    Returns:
        True if the login succeeded, False if connecting or logging in failed.
    """
    imap_server = 'imap.gmail.com'
    imap_port = 993

    try:
        # The context manager logs out on exit, so the probe connection is
        # no longer left open after the check.
        with imaplib.IMAP4_SSL(imap_server, imap_port) as imap_connection:
            imap_connection.login(username, password)
        return True
    except (imaplib.IMAP4.error, OSError, UnicodeError):
        # IMAP4.error: login rejected or protocol failure; OSError: network
        # or TLS failure; UnicodeError: credentials the client cannot encode.
        return False
27
+
28
+
29
def fetch_emails_from_imap(username, password):
    """Return the IDs of unread "Primary" emails in the Gmail INBOX.

    Logs in, selects INBOX read-only, and searches for messages matching
    ``X-GM-RAW "Category:Primary"`` and ``UNSEEN``.

    Args:
        username: The Gmail username.
        password: The Gmail password (app password).

    Returns:
        A list of message-ID strings as returned by the search, in reverse
        of the server's order. The list is empty when nothing matches or
        the search does not return ``OK``.

    Raises:
        imaplib.IMAP4.error: If the server rejects the login or a command.
        OSError: If the connection cannot be established.
    """
    imap_server = 'imap.gmail.com'
    imap_port = 993

    # The context manager logs out on exit, including on errors.
    with imaplib.IMAP4_SSL(imap_server, imap_port) as imap_connection:
        imap_connection.login(username, password)
        imap_connection.select('INBOX', readonly=True)
        status, data = imap_connection.search(
            None, 'X-GM-RAW "Category:Primary"', "UNSEEN")
        imap_connection.close()

    if status != 'OK' or not data or data[0] is None:
        return []

    # split() with no argument yields [] for an empty result, whereas
    # split(' ') would yield [''] and later trigger a fetch of an empty ID.
    email_ids = data[0].decode().split()
    email_ids.reverse()
    return email_ids
65
+
66
def _decode_subject(msg):
    """Return the Subject header of ``msg`` as text ('' when it is absent)."""
    from email.header import decode_header, make_header

    raw_subject = msg['subject']
    if raw_subject is None:
        return ""
    try:
        # make_header joins every encoded-word chunk, not just the first.
        return str(make_header(decode_header(raw_subject)))
    except (LookupError, UnicodeError):
        # Unknown or mislabelled charset: fall back to the header as received.
        return str(raw_subject)


def _decode_part(part):
    """Return the decoded text payload of one message part ('' if none)."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, errors='ignore')
    except LookupError:
        # Declared charset is not known to Python; use the old default.
        return payload.decode('utf-8', errors='ignore')


def _extract_body(msg):
    """Return the first text/plain part of ``msg``, else its first text/html part."""
    if not msg.is_multipart():
        return _decode_part(msg)

    html_part = None
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            return _decode_part(part)
        if content_type == "text/html" and html_part is None:
            html_part = part
    # HTML-only mail: hand back the markup rather than an empty string.
    return _decode_part(html_part) if html_part is not None else ""


def decode_emails(email_ids, start_index, end_index, username, password):
    """Fetch and decode the messages ``email_ids[start_index:end_index]``.

    Args:
        email_ids: Message IDs to fetch from the Gmail INBOX.
        start_index: Index of the first ID to fetch.
        end_index: Index one past the last ID to fetch.
        username: The Gmail username.
        password: The Gmail password (app password).

    Returns:
        A list with one dict per fetched message, with the keys
        ``'Message ID'``, ``'from'``, ``'subject'``, ``'content'``,
        ``'IsReply'``, ``'InReplyTo'`` and ``'StoreReplyThread'``. Messages
        whose fetch does not return data are skipped. No connection is
        opened when the slice contains no IDs.

    Raises:
        imaplib.IMAP4.error: If the server rejects the login or a command.
        OSError: If the connection cannot be established.
    """
    imap_server = 'imap.gmail.com'
    imap_port = 993

    page_ids = [email_id for email_id in email_ids[start_index:end_index] if email_id]
    if not page_ids:
        return []

    email_messages = []
    # The context manager logs out on exit, including on errors.
    with imaplib.IMAP4_SSL(imap_server, imap_port) as imap_connection:
        imap_connection.login(username, password)
        imap_connection.select('INBOX', readonly=True)

        for email_id in page_ids:
            status, data = imap_connection.fetch(email_id, '(RFC822)')
            # A successful fetch yields a (header, body-bytes) tuple first.
            if status != 'OK' or not data or not isinstance(data[0], tuple):
                continue
            msg = email.message_from_bytes(data[0][1])

            in_reply_to = msg.get("In-Reply-To", "")
            email_messages.append({
                'Message ID': msg.get("Message-ID", ""),
                'from': msg['from'],
                'subject': _decode_subject(msg),
                'content': _extract_body(msg),
                'IsReply': bool(in_reply_to),  # Check if it's a reply
                'InReplyTo': in_reply_to,  # Add the ID of the parent message
                'StoreReplyThread': [],
            })

        imap_connection.close()

    return email_messages
120
+
121
+
122
class MLStripper(HTMLParser):
    """HTML parser that collects only the text content of what it is fed."""

    def __init__(self):
        # convert_charrefs=True makes the parser pass character references
        # (e.g. "&amp;") to handle_data already decoded. The base __init__
        # already calls reset(), so no extra reset is needed.
        super().__init__(convert_charrefs=True)
        self.strict = False
        self.text = StringIO()

    def handle_data(self, d):
        """Append a run of text found between tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far."""
        return self.text.getvalue()


def strip_tags(html):
    """Return ``html`` with its markup removed and character references decoded.

    Args:
        html: The HTML (or plain text) string to clean. Empty or None input
            yields an empty string.

    Returns:
        The text content of ``html``.
    """
    if not html:
        return ""
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs the parser holds back trailing text that might
    # end in a partial character reference (e.g. "Q&A"); close() flushes it.
    s.close()
    return s.get_data()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transformers
2
+ streamlit
3
+ --find-links https://download.pytorch.org/whl/torch_stable.html
4
+ torch
5
+ optimum
6
+ auto-gptq