# Batch-caption pedestrian images with a fine-tuned Qwen2-VL model and save the results to JSON.
import os
import json

from PIL import Image
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from tqdm import tqdm
# Fix the random seed so any sampled decoding is reproducible
torch.manual_seed(1234)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load the fine-tuned model; device_map={"": device} places the entire model on the chosen device
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "/home/lijiawei/model/hub/qwen/qwen2-VLtune",
    torch_dtype=torch.float32,
    device_map={"": device},
    trust_remote_code=True,
).eval()

# The processor bundles the tokenizer and the image preprocessor
processor = AutoProcessor.from_pretrained("/home/lijiawei/model/hub/qwen/qwen2-VLtune")
input_folder = "/home/lijiawei/extracted_images"
output_json = "/home/lijiawei/model/processed_results.json"
supported_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
image_paths = [
    os.path.join(input_folder, img)
    for img in os.listdir(input_folder)
    if img.lower().endswith(supported_extensions)
]
print(f"找到 {len(image_paths)} 张图片。")
results = []
def process_single_image(image_path):
    try:
        # Open the image and normalize to RGB (handles grayscale/RGBA/palette inputs)
        image = Image.open(image_path).convert("RGB")
        # Build a single-turn chat with one image placeholder and a text prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_path},
                    {"type": "text", "text": "Please describe the pedestrian in the image"},
                ],
            }
        ]
        # Render the chat into the model's prompt string (tokenization happens later)
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
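        # For reference, the rendered prompt looks roughly like the sketch below
        # (assuming the stock Qwen2-VL chat template, which also injects a default
        # system message; the exact special tokens come from the tokenizer config):
        #
        #   <|im_start|>system
        #   You are a helpful assistant.<|im_end|>
        #   <|im_start|>user
        #   <|vision_start|><|image_pad|><|vision_end|>Please describe the pedestrian in the image<|im_end|>
        #   <|im_start|>assistant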
        # Tokenize the prompt and convert the image to pixel values, then move
        # every input tensor to the target device
        inputs = processor(
            text=[text], images=[image], padding=True, return_tensors="pt"
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=1024)
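        # Note: generate() takes its decoding settings (greedy vs. sampling)
        # from the checkpoint's generation_config; the torch.manual_seed call
        # at the top only affects the output when sampling is enabled.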
        # generate() returns prompt + completion; slice off the prompt tokens
        generated_ids_trimmed = [
            generated_ids[i][len(inputs["input_ids"][i]):]
            for i in range(len(generated_ids))
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        description = output_text[0] if output_text else "No description generated."
        # Release per-image tensors so GPU memory does not accumulate across the loop
        del inputs, generated_ids, generated_ids_trimmed, output_text
        torch.cuda.empty_cache()

        return {"image_path": image_path, "description": description}
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return {"image_path": image_path, "description": "Processing Failed"}
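# Quick single-image smoke test (hypothetical filename; substitute any file
# that actually exists in input_folder):
# print(process_single_image(os.path.join(input_folder, "sample.jpg")))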
for image_path in tqdm(image_paths, desc="Processing Images"):
    result = process_single_image(image_path)
    results.append(result)

with open(output_json, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)
print(f"批量图片处理完成,结果已保存到 {output_json}")