"""Compare the base and fine-tuned TrOCR checkpoints on the AlphaPen test set.

The script reads the test CSV, transcribes every listed image with both
models, and writes the transcriptions as two extra columns ("Baseline" and
"Finetune") to ``inference_data.csv``.
"""

from pathlib import Path

import pandas as pd
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Fine-tuned checkpoint under evaluation.
model_finetune = VisionEncoderDecoderModel.from_pretrained("hadrakey/alphapen_trocr")

# Reference checkpoint used as the baseline.
model_base = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# A single processor (from the base checkpoint) preprocesses the images and
# decodes the output of both models.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")

# Directory holding the input CSV; the results CSV is written here as well.
DATASET_DIR = "/mnt/data1/Datasets/AlphaPen/"

df_path = DATASET_DIR + "testing_data.csv"
data = pd.read_csv(df_path)
data.dropna(inplace=True)
# Without drop=True the pre-dropna row labels are kept as an "index" column,
# which therefore also ends up in the output CSV.
data.reset_index(inplace=True)

# Directory containing the cropped/rotated images referenced by the CSV.
root_dir = "/mnt/data1/Datasets/OCR/Alphapen/clean_data/"


def _transcribe(model: VisionEncoderDecoderModel, pixel_values) -> str:
    """Run *model* on one preprocessed image and return the decoded text.

    ``pixel_values`` is the ``pixel_values`` tensor produced by ``processor``
    for a single image, so only the first decoded sequence is returned.
    """
    generated_ids = model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


inf_baseline = []
inf_finetune = []
for filename in data["filename"]:
    image_path = Path(root_dir) / f"final_cropped_rotated_{filename}"
    # The context manager closes the underlying file as soon as the RGB copy
    # has been made, instead of leaving that to garbage collection.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    pixel_values = processor(image, return_tensors="pt").pixel_values

    inf_baseline.append(_transcribe(model_base, pixel_values))
    inf_finetune.append(_transcribe(model_finetune, pixel_values))

data["Baseline"] = inf_baseline
data["Finetune"] = inf_finetune

data.to_csv(DATASET_DIR + "inference_data.csv")