zhiyucheng committed
Commit bc3d4ad · verified · 1 Parent(s): 11591f7

Delete quick_test_video.py

Files changed (1)
  1. quick_test_video.py +0 -83
quick_test_video.py DELETED
@@ -1,83 +0,0 @@
- import torch
- from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoImageProcessor, AutoProcessor
- from PIL import Image
-
- import video_io
-
-
- model_path = "/lustre/fsw/portfolios/llmservice/users/charlwang/vlm-hf-code/_ga_ckpt/iter200_hf"
- device = "cuda:0"
- model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map=device, torch_dtype=torch.bfloat16).eval()
- tokenizer = AutoTokenizer.from_pretrained(model_path)
- config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
- image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True)
- processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
-
- generation_config = dict(max_new_tokens=1024, do_sample=False, eos_token_id=tokenizer.eos_token_id)
-
-
- video_path = "images/demo.mp4"
- video_fps = 1
- video_nframe = 8
- video_nframe_max = -1
-
- # Get frames and metadata
- image_urls, metadata = video_io.maybe_path_or_url_to_data_urls(
-     video_path,
-     fps=max(0, int(video_fps)),
-     nframe=max(0, int(video_nframe)),
-     nframe_max=int(video_nframe_max),
- )
- frames = [video_io.pil_image_from_base64(image_url) for image_url in image_urls]
-
- print(f"Metadata: {metadata}")
-
- messages = [
-     {
-         "role": "system",
-         "content": "/no_think"
-     },
-     {
-         "role": "user",
-         "content": [
-             {
-                 "type": "video",
-                 "video": f"file://{video_path}",
-             },
-             {
-                 "type": "text",
-                 "text": "\nDescribe what you see.",
-             },
-         ],
-     }
- ]
- prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
- # Process with FPS metadata
- if metadata:
-     inputs = processor(
-         text=[prompt],
-         videos=frames,
-         videos_kwargs={'video_metadata': metadata},
-         return_tensors="pt",
-     )
- else:
-     inputs = processor(
-         text=[prompt],
-         videos=frames,
-         return_tensors="pt",
-     )
- inputs = inputs.to(device)
-
- # Inference: Generation of the output
- model.video_pruning_rate = 0.75
- generated_ids = model.generate(
-     pixel_values_videos=inputs.pixel_values_videos,
-     input_ids=inputs.input_ids,
-     attention_mask=inputs.attention_mask,
-     max_new_tokens=128,
- )
- output_text = processor.batch_decode(
-     generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
- )
- print(f"Prompt: {prompt}\nOutput: {output_text[0]}\n\n\n")