Update README.md
Browse files
README.md
CHANGED
|
@@ -103,10 +103,10 @@ def inference(args):
|
|
| 103 |
audio_video_tensor = preprocess(audio_video_path)
|
| 104 |
else:
|
| 105 |
audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
|
| 106 |
-
question = f"Please describe the video with
|
| 107 |
|
| 108 |
# Audio Inference
|
| 109 |
-
audio_video_path = "assets/
|
| 110 |
preprocess = processor['audio' if args.modal_type == "a" else "video"]
|
| 111 |
if args.modal_type == "a":
|
| 112 |
audio_video_tensor = preprocess(audio_video_path)
|
|
@@ -115,13 +115,13 @@ def inference(args):
|
|
| 115 |
question = f"Please describe the audio."
|
| 116 |
|
| 117 |
# Video Inference
|
| 118 |
-
audio_video_path = "assets/
|
| 119 |
preprocess = processor['audio' if args.modal_type == "a" else "video"]
|
| 120 |
if args.modal_type == "a":
|
| 121 |
audio_video_tensor = preprocess(audio_video_path)
|
| 122 |
else:
|
| 123 |
audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
|
| 124 |
-
question = f"
|
| 125 |
|
| 126 |
output = mm_infer(
|
| 127 |
audio_video_tensor,
|
|
@@ -138,11 +138,12 @@ def inference(args):
|
|
| 138 |
if __name__ == "__main__":
|
| 139 |
parser = argparse.ArgumentParser()
|
| 140 |
|
| 141 |
-
parser.add_argument('--model-path', help='', required=
|
| 142 |
parser.add_argument('--modal-type', choices=["a", "v", "av"], help='', required=True)
|
| 143 |
args = parser.parse_args()
|
| 144 |
|
| 145 |
inference(args)
|
|
|
|
| 146 |
```
|
| 147 |
|
| 148 |
|
|
|
|
| 103 |
audio_video_tensor = preprocess(audio_video_path)
|
| 104 |
else:
|
| 105 |
audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
|
| 106 |
+
question = f"Please describe the video with audio information."
|
| 107 |
|
| 108 |
# Audio Inference
|
| 109 |
+
audio_video_path = "assets/bird-twitter-car.wav"
|
| 110 |
preprocess = processor['audio' if args.modal_type == "a" else "video"]
|
| 111 |
if args.modal_type == "a":
|
| 112 |
audio_video_tensor = preprocess(audio_video_path)
|
|
|
|
| 115 |
question = f"Please describe the audio."
|
| 116 |
|
| 117 |
# Video Inference
|
| 118 |
+
audio_video_path = "assets/output_v_1jgsRbGzCls.mp4"
|
| 119 |
preprocess = processor['audio' if args.modal_type == "a" else "video"]
|
| 120 |
if args.modal_type == "a":
|
| 121 |
audio_video_tensor = preprocess(audio_video_path)
|
| 122 |
else:
|
| 123 |
audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
|
| 124 |
+
question = f"What activity are the people practicing in the video?"
|
| 125 |
|
| 126 |
output = mm_infer(
|
| 127 |
audio_video_tensor,
|
|
|
|
| 138 |
if __name__ == "__main__":
|
| 139 |
parser = argparse.ArgumentParser()
|
| 140 |
|
| 141 |
+
parser.add_argument('--model-path', help='', required=False, default='DAMO-NLP-SG/VideoLLaMA2.1-7B-AV')
|
| 142 |
parser.add_argument('--modal-type', choices=["a", "v", "av"], help='', required=True)
|
| 143 |
args = parser.parse_args()
|
| 144 |
|
| 145 |
inference(args)
|
| 146 |
+
|
| 147 |
```
|
| 148 |
|
| 149 |
|