mdirshad09 committed on
Commit 2519bba · 1 Parent(s): d74cfc8

Upload 8 files

Files changed (8)
  1. app.py +136 -0
  2. data_extractor.py +50 -0
  3. embeddings_generation.py +12 -0
  4. models.py +21 -0
  5. pinecone.py +46 -0
  6. requirements.txt +77 -0
  7. scrapper.py +140 -0
  8. testing.py +0 -0
app.py ADDED
@@ -0,0 +1,136 @@
+
+ from src.data_extractor import DataExtractor
+ from src.models import FaceNetModel
+ from src.pinecone import Pinecone
+ from src.embeddings_generation import FaceEmbedding
+ import streamlit as st
+ import numpy as np
+ import cv2
+ from PIL import Image
+
+
+ data_extractor = DataExtractor('./data/Json', './data/Images')
+ combined_data, paths = data_extractor.concat_data()
+
+ model = FaceNetModel()
+ mtcnn, resnet = model.initialize_model()
+ transform = model.get_transform()
+
+ embeddings = FaceEmbedding(transform, resnet)
+
+ pinecone = Pinecone('c984cd49-42a6-4aa0-b2f2-e96cfb8f59bc', 'gcp-starter', 'facenet')
+ pinecone_index = pinecone.initialize_index()
+
+
+ def process_images():
+     """Detect faces in every scraped image and upsert their embeddings into Pinecone."""
+     count = 0
+     for index, image_path in enumerate(paths):
+         try:
+             img = Image.open(image_path)
+             img = img.convert("RGB")
+             width, height = img.size
+             boxes, _ = mtcnn.detect(img)
+             if boxes is None:
+                 # No face detected in this image; skip it
+                 continue
+
+             img_id = combined_data['id'][index]
+             img_url = combined_data['Image_URL'][index]
+             page_url = combined_data['Page_URL'][index]
+
+             if len(boxes) == 1:
+                 try:
+                     face_embedding = embeddings.calculate_face_embedding(img, boxes[0])
+                     x1, y1, x2, y2 = [int(coord) for coord in boxes[0]]
+
+                     # Store the box normalized to the image size, flagged as a single-face image
+                     coordinates = [x1 / width, y1 / height, x2 / width, y2 / height]
+                     pinecone.upsert_data(img_id, face_embedding, image_path, img_url, page_url, coordinates, True)
+                 except Exception as e:
+                     print(e)
+                     continue
+             elif len(boxes) > 1:
+                 for box in boxes:
+                     try:
+                         face_embedding = embeddings.calculate_face_embedding(img, box)
+                         x1, y1, x2, y2 = [int(coord) for coord in box]
+                         coordinates = [x1 / width, y1 / height, x2 / width, y2 / height]
+
+                         # Store each face separately, flagged as part of a multi-face image
+                         pinecone.upsert_data(img_id, face_embedding, image_path, img_url, page_url, coordinates, False)
+                     except Exception as e:
+                         print(e)
+                         continue
+
+         except FileNotFoundError:
+             print(f"File not found: {image_path}")
+
+         except OSError:
+             print(f"Not an image file or image file is corrupted: {image_path}")
+
+         except MemoryError:
+             print(f"Out of memory when trying to open image: {image_path}")
+
+         count += 1
+         print(count)
+
+
+ def search_images(query_img):
+     """Embed the first detected face in the query image and search the Pinecone index."""
+     boxes, _ = mtcnn.detect(query_img)
+     if boxes is None:
+         return None
+     query_embedding = embeddings.calculate_face_embedding(query_img, boxes[0])
+     query_embedding = query_embedding.tolist()
+
+     return pinecone.search_data(query_embedding)
+
+
+ def get_image():
+     st.title("Image Upload")
+
+     image_file = st.file_uploader("Upload Image", type=['png', 'jpeg', 'jpg', 'jfif'])
+     if image_file is not None:
+         image = Image.open(image_file)
+         st.image(image, use_column_width=True)
+         matches = search_images(image)
+
+         return matches
+
+
+ def display_image(image):
+     st.image(image, use_column_width=True)
+
+
+ def process_matches(matches):
+     """Display each match; in multi-face images, blur the faces that did not match the query."""
+     for match in matches['matches']:
+         if match['metadata']['Single Face'] == False:
+             img_id = match['metadata']['Image id']
+             results = pinecone_index.query(vector=match['values'], top_k=4, include_values=False, include_metadata=True, filter={'Image id': {'$eq': img_id}})
+             path = match['metadata']['directory path']
+             image = Image.open(path)
+             width, height = image.size
+
+             for face in results['matches']:
+                 if face['score'] < 0.9:
+                     normalized_coordinates = face['metadata']['Face Coordinates']
+                     normalized_coordinates = [float(item) for item in normalized_coordinates]
+
+                     # Convert normalized coordinates back to pixel values
+                     coordinates = [normalized_coordinates[0] * width, normalized_coordinates[1] * height, normalized_coordinates[2] * width, normalized_coordinates[3] * height]
+                     x1, y1, x2, y2 = [int(coord) for coord in coordinates]
+                     face_width = x2 - x1
+                     face_height = y2 - y1
+                     face_region = np.array(image.crop(tuple(coordinates)))
+
+                     blurred_face_region = cv2.GaussianBlur(face_region, (99, 99), 20)
+                     blurred_face_image = Image.fromarray(blurred_face_region)
+
+                     if blurred_face_image.size != (face_width, face_height):
+                         blurred_face_image = blurred_face_image.resize((face_width, face_height))
+
+                     image.paste(blurred_face_image, (x1, y1))
+             display_image(image)
+         else:
+             path = match['metadata']['directory path']
+             img = Image.open(path)
+             display_image(img)
+
+
+ if __name__ == "__main__":
+     # process_images()  # run once to populate the index
+     matches = get_image()
+     if matches is not None:
+         process_matches(matches)
data_extractor.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ import glob
+ import json
+ import pandas as pd
+
+ class DataExtractor:
+     def __init__(self, json_folder_path, image_root_directory):
+         self.json_folder_path = json_folder_path
+         self.image_root_directory = image_root_directory
+
+     def extract_json_data(self):
+         extracted_data = []
+         for filename in os.listdir(self.json_folder_path):
+             if filename.endswith(".json"):
+                 with open(os.path.join(self.json_folder_path, filename), 'r') as json_file:
+                     data = json.load(json_file)
+                     if 'query' in data and 'images' in data:
+                         query = data['query']
+                         images = data['images']
+                         for image_data in images:
+                             extracted_data.append({
+                                 'Class': query,
+                                 'id': image_data['Id'],
+                                 'Image_URL': image_data['url'],
+                                 'Title': image_data['title'],
+                                 'Page_URL': image_data['page_url']
+                             })
+         return pd.DataFrame(extracted_data)
+
+     def extract_image_paths(self):
+         extracted_data = []
+         image_files = glob.glob(os.path.join(self.image_root_directory, '**', '*.jpg'), recursive=True)
+         for image_file in image_files:
+             class_name = os.path.basename(os.path.dirname(image_file))
+             id_name = os.path.splitext(os.path.basename(image_file))[0]
+             extracted_data.append({
+                 'Class': class_name,
+                 'id': id_name,
+                 'Image_Path': image_file
+             })
+         return pd.DataFrame(extracted_data)
+
+     def concat_data(self):
+         json_data = self.extract_json_data()
+         image_data = self.extract_image_paths()
+
+         combined_data = pd.merge(json_data, image_data, on=['id'], how='inner')
+         paths = combined_data['Image_Path']
+         print(paths)
+         return combined_data, paths
embeddings_generation.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+
+ class FaceEmbedding:
+     def __init__(self, transform, resnet):
+         self.transform = transform
+         self.resnet = resnet
+
+     def calculate_face_embedding(self, image, box):
+         face = image.crop(box)
+         face = face.convert("RGB")
+         face = self.transform(face)
+         face = face.unsqueeze(0)
+         with torch.no_grad():
+             # Inference only; no gradients needed
+             face_embedding = self.resnet(face)
+         # Return a flat 512-d embedding so .tolist() yields a plain list of floats for Pinecone
+         return face_embedding.squeeze(0)
models.py ADDED
@@ -0,0 +1,21 @@
+ from facenet_pytorch import InceptionResnetV1, MTCNN
+ import torchvision.transforms as transforms
+
+ class FaceNetModel:
+     def __init__(self):
+         self.mtcnn = None
+         self.resnet = None
+         self.transform = None
+
+     def initialize_model(self):
+         self.mtcnn = MTCNN()
+         self.resnet = InceptionResnetV1(pretrained='vggface2').eval()
+         return self.mtcnn, self.resnet
+
+     def get_transform(self):
+         self.transform = transforms.Compose([
+             transforms.Resize((250, 250)),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+         ])
+         return self.transform
pinecone.py ADDED
@@ -0,0 +1,46 @@
+ import uuid
+ import pinecone
+ from pinecone import PineconeProtocolError
+
+ class Pinecone:
+     def __init__(self, api_key, environment, index_name):
+         self.api_key = api_key
+         self.environment = environment
+         self.index_name = index_name
+         self.index = None
+
+     def initialize_index(self):
+         pinecone.init(api_key=self.api_key, environment=self.environment)
+         self.index = pinecone.Index(self.index_name)
+         return self.index
+
+     def upsert_data(self, img_id, embeddings, path, img_url, page_url, face_coordinates, single_face):
+         vec_id = str(uuid.uuid4())
+         data = []
+         embedding_as_list = embeddings.tolist()
+         metadata = {'Image id': img_id, 'directory path': path, 'Image URL': img_url, 'Page URL': page_url, 'Single Face': single_face}
+         if face_coordinates is not None:
+             # Coordinates are stored as strings because Pinecone metadata lists must hold strings
+             metadata['Face Coordinates'] = [str(coord) for coord in face_coordinates]
+
+         data.append((vec_id, embedding_as_list, metadata))
+         self.index.upsert(data)
+
+     def search_data(self, query_embedding):
+         try:
+             matches = self.index.query(
+                 vector=query_embedding,
+                 top_k=10,
+                 include_values=True,
+                 include_metadata=True
+             )
+         except PineconeProtocolError as e:
+             # Re-initialize the connection and retry the query once
+             print(f"PineconeProtocolError occurred: {e}")
+             pinecone.init(api_key=self.api_key, environment=self.environment)
+             index = pinecone.Index(self.index_name)
+             matches = index.query(
+                 vector=query_embedding,
+                 top_k=10,
+                 include_values=True,
+                 include_metadata=True
+             )
+         return matches
requirements.txt ADDED
@@ -0,0 +1,77 @@
+ altair==5.1.2
+ asgiref==3.7.2
+ attrs==23.1.0
+ blinker==1.7.0
+ cachetools==5.3.2
+ certifi==2023.7.22
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ Django==4.2.7
+ dnspython==2.4.2
+ exceptiongroup==1.1.3
+ facenet-pytorch==2.5.3
+ filelock==3.13.1
+ fsspec==2023.10.0
+ gitdb==4.0.11
+ GitPython==3.1.40
+ h11==0.14.0
+ idna==3.4
+ image==1.5.33
+ importlib-metadata==6.8.0
+ Jinja2==3.1.2
+ jsonschema==4.19.2
+ jsonschema-specifications==2023.7.1
+ loguru==0.7.2
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.2.1
+ numpy==1.26.1
+ opencv-python==4.8.1.78
+ outcome==1.3.0.post0
+ packaging==23.2
+ pandas==2.1.2
+ Pillow==10.1.0
+ pinecone-client==2.2.4
+ protobuf==4.25.0
+ pyarrow==14.0.0
+ pycparser==2.21
+ pydeck==0.8.1b0
+ Pygments==2.16.1
+ PySocks==1.7.1
+ python-dateutil==2.8.2
+ pytz==2023.3.post1
+ PyYAML==6.0.1
+ referencing==0.30.2
+ requests==2.31.0
+ rich==13.6.0
+ rpds-py==0.12.0
+ selenium==4.15.2
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.0
+ sortedcontainers==2.4.0
+ sqlparse==0.4.4
+ streamlit==1.28.1
+ sympy==1.12
+ tenacity==8.2.3
+ toml==0.10.2
+ toolz==0.12.0
+ torch==2.1.0
+ torchvision==0.16.0
+ tornado==6.3.3
+ tqdm==4.66.1
+ trio==0.23.1
+ trio-websocket==0.11.1
+ typing_extensions==4.8.0
+ tzdata==2023.3
+ tzlocal==5.2
+ urllib3==2.0.7
+ validators==0.22.0
+ watchdog==3.0.0
+ win32-setctime==1.1.0
+ wsproto==1.2.0
+ zipp==3.17.0
scrapper.py ADDED
@@ -0,0 +1,140 @@
+ import selenium
+ from selenium import webdriver
+ from selenium.webdriver.common.by import By
+ import time
+ import requests
+ import os
+ import random
+ import hashlib
+ import json
+
+ user_agents = [
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.1234.56 Safari/537.36",
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/101.0.1234.56 Safari/537.36",
+     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/101.0.1234.56",
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/14.1.2",
+ ]
+
+ def fetch_image_data(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 5):
+     def scroll_to_end(wd):
+         wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+         time.sleep(sleep_between_interactions)
+
+     search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
+
+     wd.get(search_url.format(q=query))
+
+     image_data_list = []
+
+     image_count = 0
+     results_start = 0
+
+     while image_count < max_links_to_fetch:
+         scroll_to_end(wd)
+
+         # Get all image thumbnail results
+         thumbnail_results = wd.find_elements(By.CLASS_NAME, "Q4LuWd")
+         number_results = len(thumbnail_results)
+
+         print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
+         done = False
+         for img in thumbnail_results[results_start:number_results]:
+             try:
+                 img.click()
+                 time.sleep(sleep_between_interactions)
+             except Exception:
+                 continue
+
+             # Extract image data: URL, title, and dimensions
+             actual_images = wd.find_elements(By.CLASS_NAME, 'pT0Scc')
+             for actual_image in actual_images:
+                 print("ACTUAL IMAGE: ", actual_image)
+                 if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
+                     image_url = actual_image.get_attribute('src')
+
+                     response = requests.get(image_url)
+                     if response.status_code == 200:
+                         image_title = actual_image.get_attribute('alt')
+
+                         # Find the parent <a> tag of the image for the page URL
+                         parent_a_tag = actual_image.find_element(By.XPATH, './ancestor::a')
+
+                         # Get the page URL directly from the parent <a> tag
+                         image_page_url = parent_a_tag.get_attribute('href')
+
+                         # Create a folder for the specific query if it doesn't exist
+                         query_folder = os.path.join('images', query)
+                         if not os.path.exists(query_folder):
+                             os.makedirs(query_folder)
+
+                         # Generate a unique file name using the URL hash
+                         file_name = hashlib.sha1(image_url.encode()).hexdigest()[:10]
+
+                         # Create a file path with the .jpg extension
+                         file_path = os.path.join(query_folder, f"{file_name}.jpg")
+                         # id = id.split('/')[-1]
+                         # Save the image
+                         with open(file_path, 'wb') as f:
+                             f.write(response.content)
+
+                         print(f"SUCCESS - saved {image_url} - as {file_path}")
+
+                         # Store the metadata in the list
+                         image_data_list.append({
+                             "url": image_url,
+                             "title": image_title,
+                             "page_url": image_page_url,
+                             "Id": file_name
+                         })
+
+                         image_count += 1  # Increment the image count
+
+                         if image_count >= max_links_to_fetch:
+                             print(f"Found: {len(image_data_list)} images, done!")
+                             done = True
+                             break  # Exit the loop
+             if done:
+                 break
+         if done:
+             break
+
+         # Move the result start point further down
+         results_start = len(thumbnail_results)
+
+     return image_data_list
+
+ if __name__ == '__main__':
+     # Select a random user agent
+     selected_user_agent = random.choice(user_agents)
+
+     # Set the user agent for Edge driver
+     options = webdriver.EdgeOptions()
+     options.add_argument(f'user-agent={selected_user_agent}')
+
+     # Initialize the Edge driver with the specified user agent
+     wd = webdriver.Edge(options=options)
+
+     queries = ["Elon Musk", "Barack Obama", "Taylor Swift", "Bill Gates", "Eminem"]  # change your set of queries here
+
+     for query in queries:
+         num_of_images = 20
+         wd.get('https://google.com')
+         search_box = wd.find_element(By.NAME, 'q')
+         search_box.send_keys(query)
+         image_data_list = fetch_image_data(query, num_of_images, wd)
+
+         # Create a dictionary to store the image data
+         query_image_data = {
+             "query": query,
+             "images": image_data_list
+         }
+
+         # Serialize the image data dictionary to JSON
+         json_data = json.dumps(query_image_data, indent=4)
+
+         # Save the JSON data to a file with the query name
+         json_filename = f"{query}.json"
+         with open(json_filename, 'w') as json_file:
+             json_file.write(json_data)
+
+     wd.quit()
testing.py ADDED
File without changes