source ~/environments/clearml/bin/activate

python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id 0fe5857173ac484a89316214b14fcf96 \
    --clearml-model \
    --queue-name oneshot-a100x2 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-72B-Instruct/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256
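
# The no-op heredoc below (from ": <<'END'" through the closing "END") is read
# and discarded by the shell, so the nine remaining invocations are effectively
# commented out; only the Qwen2.5-72B-Instruct job above is queued when this
# script runs.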
: <<'END'
python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id 6a4ecaa68a6e45ea80c62680b0a65aa0 \
    --clearml-model \
    --queue-name oneshot-a100x2 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-72B/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256

python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id c67be85bc77f462c93381280019dea1d \
    --clearml-model \
    --queue-name oneshot-a100x4 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-7B/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256

python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id effae214cc464181a92d5a57df10f3d6 \
    --clearml-model \
    --queue-name oneshot-a100x1 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-0.5B/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256

python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id 21dd39ef3013401d84b258410647e847 \
    --clearml-model \
    --queue-name oneshot-a100x1 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-0.5B-Instruct/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256

python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id 324acdd4c7c4426dbdfeb29667dc4b53 \
    --clearml-model \
    --queue-name oneshot-a100x1 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-1.5B/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256

python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id 82cbaa6e27c84f08ac10e9f115034b0b \
    --clearml-model \
    --queue-name oneshot-a100x1 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-1.5B-Instruct/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256

python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id 337f5b50610443c7ad2a380dce8e0be8 \
    --clearml-model \
    --queue-name oneshot-a100x1 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-3B/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256

python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id a8ebd9cae5324572906d50f95eeee5dd \
    --clearml-model \
    --queue-name oneshot-a100x1 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-3B-Instruct/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256

python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
    --model-id 5190911e94a340988dac223c252e72a2 \
    --clearml-model \
    --queue-name oneshot-a100x1 \
    --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
    --task-name "Qwen2.5-7B-Instruct/openllm/vllm" \
    --benchmark-tasks openllm \
    --max-model-len 4096 \
    --add-bos-token \
    --batch-size auto \
    --enable-chunked-prefill \
    --gpu-memory-utilization 0.9 \
    --max-num-batched-tokens 256
END
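
# All ten invocations above are identical except for --model-id, --queue-name,
# and --task-name. Below is a minimal loop-based sketch of the same queuing
# step: the id/queue/name triples are copied verbatim from the commands above,
# while the MODELS array, its layout, and the variable names are illustrative
# assumptions, not part of the original workflow. Note that the original script
# queues only the first job; the other nine sit inside the disabled heredoc.
MODELS=(
    "0fe5857173ac484a89316214b14fcf96 oneshot-a100x2 Qwen2.5-72B-Instruct"
    "6a4ecaa68a6e45ea80c62680b0a65aa0 oneshot-a100x2 Qwen2.5-72B"
    "c67be85bc77f462c93381280019dea1d oneshot-a100x4 Qwen2.5-7B"
    "effae214cc464181a92d5a57df10f3d6 oneshot-a100x1 Qwen2.5-0.5B"
    "21dd39ef3013401d84b258410647e847 oneshot-a100x1 Qwen2.5-0.5B-Instruct"
    "324acdd4c7c4426dbdfeb29667dc4b53 oneshot-a100x1 Qwen2.5-1.5B"
    "82cbaa6e27c84f08ac10e9f115034b0b oneshot-a100x1 Qwen2.5-1.5B-Instruct"
    "337f5b50610443c7ad2a380dce8e0be8 oneshot-a100x1 Qwen2.5-3B"
    "a8ebd9cae5324572906d50f95eeee5dd oneshot-a100x1 Qwen2.5-3B-Instruct"
    "5190911e94a340988dac223c252e72a2 oneshot-a100x1 Qwen2.5-7B-Instruct"
)
for entry in "${MODELS[@]}"; do
    # Split each entry into its three whitespace-separated fields.
    read -r model_id queue model_name <<< "$entry"
    python /cache/git/research/automation/evaluation_scripts/queue_lm_evaluation_harness_vllm.py \
        --model-id "$model_id" \
        --clearml-model \
        --queue-name "$queue" \
        --project-name "LLM quantization - W8A16/llmcompressor/Qwen2.5" \
        --task-name "${model_name}/openllm/vllm" \
        --benchmark-tasks openllm \
        --max-model-len 4096 \
        --add-bos-token \
        --batch-size auto \
        --enable-chunked-prefill \
        --gpu-memory-utilization 0.9 \
        --max-num-batched-tokens 256
done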