Spaces:

bryanzhou008
/

contrastive-data-augmentation

Runtime error

App Files Files Community

bryanzhou008 committed on Jan 15, 2024

Commit

a103d54

verified ·

1 Parent(s): 1dab74f

Upload 5 files

Browse files

Files changed (5) hide show

app.py +21 -0
environment.yml +220 -0
src/v1.py +87 -0
src/v2.py +110 -0
src/v2_for_hf.py +90 -0

app.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import gradio as gr
+from src.v2_for_hf import generate_images
+from src.v2_for_hf import NUM_GEN
+iface = gr.Interface(
+    fn=generate_images,
+    inputs=[
+        gr.Textbox(label="OpenAI API Key"),
+        gr.Image(label="Input Image", type="filepath"),
+        gr.Textbox(label="Mistaken Class"),
+        gr.Textbox(label="Ground Truth Class")
+    ],
+    outputs=[
+        gr.Image(label="Output Image") for i in range(NUM_GEN)
+    ],
+    title="visual-data-aug",
+)
+if __name__ == "__main__":
+    iface.launch(share=True)

environment.yml ADDED Viewed

	@@ -0,0 +1,220 @@

+name: torch_env
+channels:
+  - pytorch
+  - defaults
+  - conda-forge
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - blas=1.0=mkl
+  - brotli-python=1.0.9=py38h6a678d5_7
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.12.12=h06a4308_0
+  - cryptography=41.0.7=py38hdda0065_0
+  - cudatoolkit=10.2.89=h713d32c_10
+  - ffmpeg=4.3=hf484d3e_0
+  - freetype=2.12.1=h4a9f257_0
+  - giflib=5.2.1=h5eee18b_3
+  - gmp=6.2.1=h295c915_3
+  - gmpy2=2.1.2=py38heeb90bb_0
+  - gnutls=3.6.15=he1e5248_0
+  - idna=3.4=py38h06a4308_0
+  - intel-openmp=2023.1.0=hdb19cb5_46306
+  - jinja2=3.1.2=py38h06a4308_0
+  - jpeg=9e=h5eee18b_1
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - lerc=3.0=h295c915_0
+  - libdeflate=1.17=h5eee18b_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libiconv=1.16=h7f8727e_2
+  - libidn2=2.3.4=h5eee18b_0
+  - libjpeg-turbo=2.0.0=h9bf148f_0
+  - libpng=1.6.39=h5eee18b_0
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libtasn1=4.19.0=h5eee18b_0
+  - libtiff=4.5.1=h6a678d5_0
+  - libunistring=0.9.10=h27cfd23_0
+  - libwebp=1.3.2=h11a3e52_0
+  - libwebp-base=1.3.2=h5eee18b_0
+  - llvm-openmp=14.0.6=h9e868ea_0
+  - lz4-c=1.9.4=h6a678d5_0
+  - markupsafe=2.1.3=py38h5eee18b_0
+  - mkl=2023.1.0=h213fc3f_46344
+  - mkl-service=2.4.0=py38h5eee18b_1
+  - mkl_fft=1.3.8=py38h5eee18b_0
+  - mkl_random=1.2.4=py38hdb19cb5_0
+  - mpc=1.1.0=h10f8cd9_1
+  - mpfr=4.0.2=hb69a4c5_1
+  - mpmath=1.3.0=py38h06a4308_0
+  - ncurses=6.4=h6a678d5_0
+  - nettle=3.7.3=hbbd107a_1
+  - networkx=3.1=py38h06a4308_0
+  - numpy=1.24.3=py38hf6e8229_1
+  - numpy-base=1.24.3=py38h060ed82_1
+  - openh264=2.1.1=h4ff587b_0
+  - openjpeg=2.4.0=h3ad879b_0
+  - openssl=3.0.12=h7f8727e_0
+  - pip=23.3.1=py38h06a4308_0
+  - pycparser=2.21=pyhd3eb1b0_0
+  - pyopenssl=23.2.0=py38h06a4308_0
+  - pysocks=1.7.1=py38h06a4308_0
+  - python=3.8.18=h955ad1f_0
+  - pytorch-mutex=1.0=cpu
+  - readline=8.2=h5eee18b_0
+  - requests=2.31.0=py38h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - sympy=1.12=py38h06a4308_0
+  - tbb=2021.8.0=hdb19cb5_0
+  - tk=8.6.12=h1ccaba5_0
+  - torchaudio=2.1.2=py38_cpu
+  - torchvision=0.16.2=py38_cpu
+  - typing_extensions=4.9.0=py38h06a4308_0
+  - wheel=0.41.2=py38h06a4308_0
+  - xz=5.4.5=h5eee18b_0
+  - yaml=0.2.5=h7b6447c_0
+  - zlib=1.2.13=h5eee18b_0
+  - zstd=1.5.5=hc292b87_0
+  - pip:
+      - accelerate==0.26.1
+      - aiofiles==23.2.1
+      - aiohttp==3.8.4
+      - aiosignal==1.3.1
+      - altair==5.2.0
+      - annotated-types==0.6.0
+      - anyio==4.2.0
+      - argon2-cffi==21.3.0
+      - argon2-cffi-bindings==21.2.0
+      - argparse==1.4.0
+      - asttokens==2.4.1
+      - async-timeout==4.0.3
+      - attrs==23.1.0
+      - backcall==0.2.0
+      - beautifulsoup4==4.12.2
+      - bleach==6.0.0
+      - certifi==2023.5.7
+      - cffi==1.15.1
+      - charset-normalizer==3.1.0
+      - click==8.1.3
+      - cmake==3.28.1
+      - colorama==0.4.6
+      - comm==0.2.1
+      - contourpy==1.1.1
+      - cycler==0.12.1
+      - datasets==2.13.1
+      - debugpy==1.8.0
+      - decorator==5.1.1
+      - diffusers==0.24.0
+      - dill==0.3.6
+      - distro==1.9.0
+      - exceptiongroup==1.2.0
+      - executing==2.0.1
+      - fastapi==0.109.0
+      - fastjsonschema==2.17.1
+      - ffmpy==0.3.1
+      - filelock==3.12.2
+      - fonttools==4.47.2
+      - frozenlist==1.4.1
+      - fsspec==2023.12.2
+      - gradio==4.14.0
+      - gradio-client==0.8.0
+      - h11==0.14.0
+      - httpcore==1.0.2
+      - httpx==0.26.0
+      - huggingface-hub==0.20.1
+      - importlib-metadata==6.7.0
+      - importlib-resources==6.1.1
+      - ipykernel==6.24.0
+      - ipython==8.12.2
+      - jedi==0.18.2
+      - joblib==1.3.1
+      - jsonschema==4.17.3
+      - jupyter-client==8.6.0
+      - jupyter-core==5.7.1
+      - kiwisolver==1.4.5
+      - lit==17.0.6
+      - markdown-it-py==3.0.0
+      - matplotlib==3.7.1
+      - matplotlib-inline==0.1.6
+      - mdurl==0.1.2
+      - multidict==6.0.4
+      - multiprocess==0.70.14
+      - nest-asyncio==1.5.8
+      - nltk==3.8.1
+      - nvidia-cublas-cu11==11.10.3.66
+      - nvidia-cuda-cupti-cu11==11.7.101
+      - nvidia-cuda-nvrtc-cu11==11.7.99
+      - nvidia-cuda-runtime-cu11==11.7.99
+      - nvidia-cudnn-cu11==8.5.0.96
+      - nvidia-cufft-cu11==10.9.0.58
+      - nvidia-curand-cu11==10.2.10.91
+      - nvidia-cusolver-cu11==11.4.0.1
+      - nvidia-cusparse-cu11==11.7.4.91
+      - nvidia-nccl-cu11==2.14.3
+      - nvidia-nvtx-cu11==11.7.91
+      - openai==1.6.1
+      - orjson==3.9.10
+      - packaging==23.2
+      - pandas==2.0.3
+      - parso==0.8.3
+      - pexpect==4.9.0
+      - pickleshare==0.7.5
+      - pillow==10.0.0
+      - pkgutil-resolve-name==1.3.10
+      - platformdirs==4.1.0
+      - prompt-toolkit==3.0.38
+      - psutil==5.9.5
+      - ptyprocess==0.7.0
+      - pure-eval==0.2.2
+      - pyarrow==14.0.2
+      - pydantic==2.5.3
+      - pydantic-core==2.14.6
+      - pydub==0.25.1
+      - pygments==2.15.1
+      - pyparsing==3.1.0
+      - pyrsistent==0.20.0
+      - python-dateutil==2.8.2
+      - python-multipart==0.0.6
+      - pytz==2023.3
+      - pyyaml==6.0
+      - pyzmq==25.1.2
+      - regex==2023.12.25
+      - rich==13.7.0
+      - safetensors==0.4.1
+      - scikit-learn==1.3.0
+      - scipy==1.10.1
+      - semantic-version==2.10.0
+      - sentence-transformers==2.2.2
+      - sentencepiece==0.1.99
+      - setuptools==67.8.0
+      - shellingham==1.5.4
+      - six==1.16.0
+      - sniffio==1.3.0
+      - soupsieve==2.5
+      - stack-data==0.6.3
+      - starlette==0.35.1
+      - threadpoolctl==3.2.0
+      - tokenizers==0.13.3
+      - tomlkit==0.12.0
+      - toolz==0.12.0
+      - torch==2.0.1
+      - tornado==6.4
+      - tqdm==4.65.0
+      - traitlets==5.14.1
+      - transformers==4.30.2
+      - triton==2.0.0
+      - typer==0.9.0
+      - tzdata==2023.4
+      - urllib3==2.0.3
+      - uvicorn==0.25.0
+      - wcwidth==0.2.13
+      - webencodings==0.5.1
+      - websockets==11.0.3
+      - xxhash==3.4.1
+      - yarl==1.9.4
+      - zipp==3.17.0
+      - gradio
+prefix: /home/bingxuan/anaconda3/envs/torch_env

src/v1.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from openai import OpenAI
+import base64
+import requests
+import re
+from diffusers import DiffusionPipeline
+import torch
+from PIL import Image
+import os
+import argparse
+SD_pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
+SD_pipe.to("cuda")
+RF_pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
+RF_pipe.to("cuda")
+# Function to encode the image
+def encode_image(image_path):
+  with open(image_path, "rb") as image_file:
+    return base64.b64encode(image_file.read()).decode('utf-8')
+def vision_gpt(prompt, image_url, api_key):
+    client = OpenAI(api_key=api_key)
+    response = client.chat.completions.create(
+        model="gpt-4-vision-preview",
+        messages=[
+            {
+              "role": "user",
+              "content": [
+                  {"type": "text",
+                   "text": prompt},
+                  {
+                      "type": "image_url",
+                      "image_url": {
+                          "url": f"data:image/jpeg;base64,{image_url}", },
+                  },
+              ],
+            }
+        ],
+        max_tokens=600,
+    )
+    return response.choices[0].message.content
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="extract differentiating attributes of the gt object class from the mistaken object class, generate synthatic images of the gt class highlighting such attributes")
+    parser.add_argument('-i', "--input_path", type=str, metavar='', required=True, help="path to input image")
+    parser.add_argument('-o', "--output_path", type=str, metavar='', required=True, help="path to output folder")
+    parser.add_argument('-k', "--api_key", type=str, metavar='', required=True, help="valid openai api key")
+    parser.add_argument('-m', "--mistaken_class", type=str, metavar='', required=True, help="model wrongly predicted this class")
+    parser.add_argument('-g', "--ground_truth_class", type=str, metavar='', required=True, help="the ground truth class of the image")
+    parser.add_argument('-n', "--num_generations", type=int, metavar='', required=False, default=5, help="number of generations")
+    args = parser.parse_args()
+    gt, ms = args.ground_truth_class, args.mistaken_class
+    if os.path.exists(args.output_path):
+        pass
+    else:
+        os.mkdir(args.output_path)
+    base64_image = encode_image(args.input_path)
+    prompt = """List features of the {} in this image that make it distinct from a {}? Then, write a short and
+    concise non-artistic visual diffusion prompt of a {} that includes the above features of {} (starting
+    with 'photorealistic candid portrait of') and put it inside square brackets []. Do no mention {} in
+    your prompt and ignore unrelated background scenes.""".format(gt, ms, gt, gt, ms, ms)
+    print("--------------gpt prompt--------------: \n", prompt, "\n\n")
+    response = vision_gpt(prompt, base64_image, args.api_key)
+    print("--------------GPT response--------------: \n", response, "\n\n")
+    stable_diffusion_prompt =  re.search(r'\[(.*?)\]', response).group(1)
+    print("--------------stable_diffusion_prompt-------------- \n", stable_diffusion_prompt, "\n\n")
+    for i in range(args.num_generations):
+        generated_images = SD_pipe(prompt=stable_diffusion_prompt, num_inference_steps=75).images
+        refined_image = RF_pipe(prompt=stable_diffusion_prompt, image=generated_images).images[0]
+        refined_image.save(args.output_path + "{}.png".format(i), 'PNG')

src/v2.py ADDED Viewed

	@@ -0,0 +1,110 @@

+from openai import OpenAI
+import base64
+import requests
+import re
+from diffusers import DiffusionPipeline
+import torch
+from PIL import Image
+import os
+import argparse
+# Function to encode the image
+def encode_image(image_path):
+  with open(image_path, "rb") as image_file:
+    return base64.b64encode(image_file.read()).decode('utf-8')
+# Function to retrieve openai api key
+def get_openai_key(key_path):
+	with open(key_path) as f:
+		key = f.read().strip()
+	print("Reading OpenAI API key from: ", key_path)
+	return key
+# Function to obtain GPT4V response
+def vision_gpt(prompt, image_url, api_key):
+    client = OpenAI(api_key=api_key)
+    response = client.chat.completions.create(
+        model="gpt-4-vision-preview",
+        messages=[
+            {
+              "role": "user",
+              "content": [
+                  {"type": "text",
+                   "text": prompt},
+                  {
+                      "type": "image_url",
+                      "image_url": {
+                          "url": f"data:image/jpeg;base64,{image_url}", },
+                  },
+              ],
+            }
+        ],
+        max_tokens=600,
+    )
+    return response.choices[0].message.content
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="extract differentiating attributes of the gt object class from the mistaken object class, generate synthatic images of the gt class highlighting such attributes")
+    parser.add_argument('-i', "--input_path", type=str, metavar='', required=True, help="path to input image")
+    parser.add_argument('-o', "--output_path", type=str, metavar='', required=True, help="path to output folder")
+    parser.add_argument('-k', "--api_key_path", type=str, metavar='', required=True, help="path to file containing openai api key")
+    parser.add_argument('-m', "--mistaken_class", type=str, metavar='', required=True, help="model wrongly predicted this class")
+    parser.add_argument('-g', "--ground_truth_class", type=str, metavar='', required=True, help="the ground truth class of the image")
+    parser.add_argument('-n', "--num_generations", type=int, metavar='', required=False, default=5, help="number of generations")
+    args = parser.parse_args()
+    gt, ms = args.ground_truth_class, args.mistaken_class
+    oai_key = get_openai_key(args.api_key_path)
+    if os.path.exists(args.output_path):
+        pass
+    else:
+        os.mkdir(args.output_path)
+    base64_image = encode_image(args.input_path)
+    prompt = """
+    List key features of the {} itself in this image that make it distinct from a {}? Then, write a very short and
+    concise visual midjourney prompt of the {} that includes the above features of {} (prompt should start
+    with '4K SLR photo,') and put it inside square brackets []. Do no mention {} in your prompt, also do not mention
+    non-essential background scenes like "calm waters, mountains" and sub-components like "paddle of canoe" in the prompt.
+    """.format(gt, ms, gt, gt, ms, ms)
+    # prompt = """
+    # List features of the {} in this image that make it distinct from a {}? Then, write a very short and
+    # concise non-artistic visual diffusion prompt of a {} that includes the above features of {} (starting
+    # with 'photo,') and put it inside square brackets []. Do no mention {} in
+    # your prompt, ignore unrelated background scenes, non-essential sub-components, objects, and people.
+    # """.format(gt, ms, gt, gt, ms, ms)
+    print("--------------gpt prompt--------------: \n", prompt, "\n\n")
+    response = vision_gpt(prompt, base64_image, oai_key)
+    print("--------------GPT response--------------: \n", response, "\n\n")
+    stable_diffusion_prompt =  re.search(r'\[(.*?)\]', response).group(1)
+    print("--------------stable_diffusion_prompt-------------- \n", stable_diffusion_prompt, "\n\n")
+    SD_pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
+    SD_pipe.to("cuda")
+    RF_pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
+    RF_pipe.to("cuda")
+    for i in range(args.num_generations):
+        generated_images = SD_pipe(prompt=stable_diffusion_prompt, num_inference_steps=75).images
+        refined_image = RF_pipe(prompt=stable_diffusion_prompt, image=generated_images).images[0]
+        # refined_image = RF_pipe(prompt=stable_diffusion_prompt, image=refined_image).images[0]
+        # refined_image = RF_pipe(prompt=stable_diffusion_prompt, image=refined_image).images[0]
+        refined_image.save(args.output_path + "{}.png".format(i), 'PNG')

src/v2_for_hf.py ADDED Viewed

	@@ -0,0 +1,90 @@

+from openai import OpenAI
+import base64
+import requests
+import re
+from diffusers import DiffusionPipeline
+import torch
+from PIL import Image
+import os
+from huggingface_hub import login
+with open("key.txt", "r") as f:
+    login(token=f.read().strip())
+# Modfiy this to change the number of generations
+NUM_GEN = 2
+def encode_image(image_path):
+  with open(image_path, "rb") as image_file:
+    return base64.b64encode(image_file.read()).decode('utf-8')
+def vision_gpt(prompt, image_url, api_key):
+    client = OpenAI(api_key=api_key)
+    response = client.chat.completions.create(
+        model="gpt-4-vision-preview",
+        messages=[
+            {
+              "role": "user",
+              "content": [
+                  {"type": "text",
+                   "text": prompt},
+                  {
+                      "type": "image_url",
+                      "image_url": {
+                          "url": f"data:image/jpeg;base64,{image_url}", },
+                  },
+              ],
+            }
+        ],
+        max_tokens=600,
+    )
+    return response.choices[0].message.content
+def generate_images(oai_key, input_path, mistaken_class, ground_truth_class):
+    output_path = "out/"
+    num_generations = 2
+    print("--------------input_path--------------: \n", input_path, "\n\n")
+    base64_image = encode_image(input_path)
+    prompt = """
+    List key features of the {} itself in this image that make it distinct from a {}? Then, write a very short and
+    concise visual midjourney prompt of the {} that includes the above features of {} (prompt should start
+    with '4K SLR photo,') and put it inside square brackets []. Do no mention {} in your prompt, also do not mention
+    non-essential background scenes like "calm waters, mountains" and sub-components like "paddle of canoe" in the prompt.
+    """.format(ground_truth_class, mistaken_class, ground_truth_class, ground_truth_class, mistaken_class, mistaken_class)
+    print("--------------gpt prompt--------------: \n", prompt, "\n\n")
+    response = vision_gpt(prompt, base64_image, oai_key)
+    print("--------------GPT response--------------: \n", response, "\n\n")
+    stable_diffusion_prompt =  re.search(r'\[(.*?)\]', response).group(1)
+    print("--------------stable_diffusion_prompt-------------- \n", stable_diffusion_prompt, "\n\n")
+    SD_pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
+    RF_pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
+    SD_pipe.to("cuda")
+    RF_pipe.to("cuda")
+    out_images = []
+    for i in range(NUM_GEN):
+        generated_images = SD_pipe(prompt=stable_diffusion_prompt, num_inference_steps=75).images
+        refined_image = RF_pipe(prompt=stable_diffusion_prompt, image=generated_images).images[0]
+        refined_image = RF_pipe(prompt=stable_diffusion_prompt, image=refined_image).images[0]
+        refined_image = RF_pipe(prompt=stable_diffusion_prompt, image=refined_image).images[0]
+        # refined_image.save(output_path + "{}.png".format(i), 'PNG')
+        out_images.append(refined_image)
+    return tuple(out_images)
+if __name__ == "__main__":
+    oai_key = "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"
+    input_path = "out/0.png"
+    mistaken_class = "dog"
+    ground_truth_class = "cat"
+    generate_images(oai_key, input_path, mistaken_class, ground_truth_class)