import os
import json
 from PIL import Image
 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from tqdm import tqdm
 
 
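# Fix the random seed so generation is reproducible across runs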
 torch.manual_seed(1234)
 
 
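# Use the first GPU when available, otherwise fall back to CPU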
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
 
 
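# Load the fine-tuned Qwen2-VL checkpoint in full precision onto the chosen device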
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "/home/lijiawei/model/hub/qwen/qwen2-VLtune",
    torch_dtype=torch.float32,
    device_map={"": device},
    trust_remote_code=True
).eval()
 
 
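# The processor bundles the tokenizer and the image preprocessor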
 processor = AutoProcessor.from_pretrained("/home/lijiawei/model/hub/qwen/qwen2-VLtune")
 
 
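# Source folder of extracted images and destination for the JSON results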
 input_folder = "/home/lijiawei/extracted_images"
 output_json = "/home/lijiawei/model/processed_results.json"
 
 
 supported_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
 
 
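# Collect every file in the folder with a supported image extension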
image_paths = [
    os.path.join(input_folder, img)
    for img in os.listdir(input_folder)
    if img.lower().endswith(supported_extensions)
]
 
print(f"Found {len(image_paths)} images.")
 
 
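# Accumulates one result record per image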
 results = []
 
 
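# Run a single image through the model and return its description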
def process_single_image(image_path):
    try:
        # Load the image and normalize to RGB (handles grayscale/RGBA inputs)
        image = Image.open(image_path).convert("RGB")

        # Build a chat-style prompt containing the image and the instruction
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image_path,
                    },
                    {"type": "text", "text": "Please describe the pedestrian in the image"},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Tokenize the prompt and preprocess the image into model inputs
        inputs = processor(
            text=[text],
            images=[image],
            padding=True,
            return_tensors="pt"
        )

        # Move all input tensors to the target device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate the description without tracking gradients
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=1024)

        # Strip the prompt tokens so only the newly generated tokens remain
        generated_ids_trimmed = [
            generated_ids[i][len(inputs['input_ids'][i]):]
            for i in range(len(generated_ids))
        ]

        # Decode the generated tokens back into text
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        description = output_text[0] if output_text else "No description generated."

        # Free per-image tensors to keep GPU memory usage flat across the loop
        del inputs, generated_ids, generated_ids_trimmed, output_text
        torch.cuda.empty_cache()

        return {
            "image_path": image_path,
            "description": description
        }

    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return {
            "image_path": image_path,
            "description": "Processing Failed"
        }
 
 
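# Process all images sequentially with a progress bar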
for image_path in tqdm(image_paths, desc="Processing Images"):
    result = process_single_image(image_path)
    results.append(result)
 
 
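# Write the collected results to disk as pretty-printed UTF-8 JSON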
with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)
 
print(f"Batch image processing complete; results saved to {output_json}")
 
 