XFFXFF committed on
Commit
ad9ba0d
·
1 Parent(s): 6583f58

update readme

Browse files
Files changed (1) hide show
  1. README.md +26 -24
README.md CHANGED
@@ -60,7 +60,10 @@ base_model:
60
  ## Quick Start
61
  ### Installation
62
  ```
63
- pip install transformers==4.45.0 accelerate==0.34.1 sentencepiece==0.2.0 torchvision requests torch Pillow
 
 
 
64
  pip install flash-attn --no-build-isolation
65
 
66
  # For better inference performance, you can install grouped-gemm, which may take 3-5 minutes to install
@@ -77,23 +80,24 @@ Here is a code snippet to show you how to use Aria.
77
  import requests
78
  import torch
79
  from PIL import Image
80
- from transformers import AutoModelForCausalLM, AutoProcessor
81
 
82
- model_id_or_path = "rhymes-ai/Aria"
83
 
84
- model = AutoModelForCausalLM.from_pretrained(model_id_or_path, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
85
 
86
- processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True)
 
 
 
87
 
88
- image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
89
 
90
- image = Image.open(requests.get(image_path, stream=True).raw)
91
 
92
  messages = [
93
  {
94
  "role": "user",
95
  "content": [
96
- {"text": None, "type": "image"},
97
  {"text": "what is the image?", "type": "text"},
98
  ],
99
  }
@@ -101,22 +105,20 @@ messages = [
101
 
102
  text = processor.apply_chat_template(messages, add_generation_prompt=True)
103
  inputs = processor(text=text, images=image, return_tensors="pt")
104
- inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
105
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
106
-
107
- with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
108
- output = model.generate(
109
- **inputs,
110
- max_new_tokens=500,
111
- stop_strings=["<|im_end|>"],
112
- tokenizer=processor.tokenizer,
113
- do_sample=True,
114
- temperature=0.9,
115
- )
116
- output_ids = output[0][inputs["input_ids"].shape[1]:]
117
- result = processor.decode(output_ids, skip_special_tokens=True)
118
-
119
- print(result)
120
  ```
121
 
122
  ### Advanced Inference and Fine-tuning
 
60
  ## Quick Start
61
  ### Installation
62
  ```
63
+ # Install transformers from GitHub until the next release includes the Aria model
64
+ pip install git+https://github.com/huggingface/transformers.git
65
+
66
+ pip install accelerate sentencepiece torchvision requests torch Pillow
67
  pip install flash-attn --no-build-isolation
68
 
69
  # For better inference performance, you can install grouped-gemm, which may take 3-5 minutes to install
 
80
  import requests
81
  import torch
82
  from PIL import Image
 
83
 
84
+ from transformers import AriaProcessor, AriaForConditionalGeneration
85
 
 
86
 
87
+ model_id_or_path = "rhymes-ai/Aria"
88
+ model = AriaForConditionalGeneration.from_pretrained(
89
+ model_id_or_path, device_map="auto", torch_dtype=torch.bfloat16
90
+ )
91
 
92
+ processor = AriaProcessor.from_pretrained(model_id_or_path)
93
 
94
+ image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
95
 
96
  messages = [
97
  {
98
  "role": "user",
99
  "content": [
100
+ {"type": "image"},
101
  {"text": "what is the image?", "type": "text"},
102
  ],
103
  }
 
105
 
106
  text = processor.apply_chat_template(messages, add_generation_prompt=True)
107
  inputs = processor(text=text, images=image, return_tensors="pt")
108
+ inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
109
+ inputs.to(model.device)
110
+
111
+ output = model.generate(
112
+ **inputs,
113
+ max_new_tokens=15,
114
+ stop_strings=["<|im_end|>"],
115
+ tokenizer=processor.tokenizer,
116
+ do_sample=True,
117
+ temperature=0.9,
118
+ )
119
+ output_ids = output[0][inputs["input_ids"].shape[1]:]
120
+ response = processor.decode(output_ids, skip_special_tokens=True)
121
+ print(response)
 
 
122
  ```
123
 
124
  ### Advanced Inference and Fine-tuning