Spaces:

szymmon
/

SmolVLM_Essay_Knowledge_Distillation

Runtime error

SmolVLM_Essay_Knowledge_Distillation / app.py

f565140 10 months ago

4.25 kB

	import gradio as gr
	import torch
	from transformers import AutoProcessor, Idefics3ForConditionalGeneration
	import logging

	logger = logging.getLogger(__name__)


	class SimpleVLMInterface:
	    """Single-turn text/image chat wrapper around SmolVLM-Instruct.

	    The model, its processor and a custom adapter are loaded eagerly when the
	    object is constructed; ``generate_response`` then serves one prompt at a
	    time.
	    """

	    def __init__(self):
	        self.model = None
	        self.processor = None
	        self.initialize_model()

	    def initialize_model(self):
	        """Load the base model, its processor and the custom adapter.

	        Raises:
	            Exception: Any failure while loading is logged (with traceback)
	                and re-raised unchanged.
	        """
	        model_id = "HuggingFaceTB/SmolVLM-Instruct"
	        adapter_path = "smolvlm-instruct-trl-sft-ChartQA"
	        try:
	            self.model = Idefics3ForConditionalGeneration.from_pretrained(
	                model_id,
	                device_map="auto",
	                torch_dtype=torch.bfloat16,
	            )
	            self.processor = AutoProcessor.from_pretrained(model_id)
	            # Load custom adapter
	            self.model.load_adapter(adapter_path)
	        except Exception as e:
	            logger.exception("Error initializing model: %s", e)
	            raise

	    def generate_response(
	        self,
	        text_input,
	        image=None,
	        max_tokens=512,
	        temperature=0.7,
	        top_p=0.95,
	    ):
	        """Generate a sampled reply to a text prompt and an optional image.

	        Args:
	            text_input: Prompt text placed in the user message.
	            image: Optional image exposing ``mode`` and ``convert`` (the UI
	                in this file passes a PIL image). Converted to RGB if its
	                mode is anything else.
	            max_tokens: Upper bound on newly generated tokens. Coerced to
	                ``int`` because callers may hand over a float.
	            temperature: Sampling temperature forwarded to ``generate``.
	            top_p: Nucleus-sampling threshold forwarded to ``generate``.

	        Returns:
	            The decoded completion with the prompt tokens stripped, or the
	            string ``"Error: <message>"`` if any step raises.
	        """
	        try:
	            # Build the multimodal user message: optional image first, then text.
	            content = []
	            images = None
	            if image is not None:
	                if image.mode != 'RGB':
	                    image = image.convert('RGB')
	                content.append({'type': 'image', 'image': image})
	                images = [image]
	            content.append({'type': 'text', 'text': text_input})

	            # The chat template expects a sequence of messages, hence the list.
	            chat_input = self.processor.apply_chat_template(
	                [{'role': 'user', 'content': content}],
	                add_generation_prompt=True,
	            )

	            model_inputs = self.processor(
	                text=chat_input,
	                images=images,
	                return_tensors="pt",
	            ).to(self.model.device)

	            generated_ids = self.model.generate(
	                **model_inputs,
	                max_new_tokens=int(max_tokens),
	                temperature=temperature,
	                top_p=top_p,
	                do_sample=True,
	            )

	            # Drop the echoed prompt tokens so only the new completion is decoded.
	            trimmed_generated_ids = [
	                out_ids[len(in_ids):]
	                for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
	            ]
	            return self.processor.batch_decode(
	                trimmed_generated_ids,
	                skip_special_tokens=True,
	                clean_up_tokenization_spaces=False,
	            )[0]
	        except Exception as e:
	            # Report the failure as text so the caller always gets a string back.
	            logger.exception("Error generating response: %s", e)
	            return f"Error: {str(e)}"

	def create_interface():
	    """Build the Gradio Blocks UI wired to a freshly loaded ``SimpleVLMInterface``.

	    Constructing the interface loads the model immediately, so any loading
	    error propagates from this call.

	    Returns:
	        The ``gr.Blocks`` object; the caller is responsible for launching it.
	    """
	    vlm = SimpleVLMInterface()
	    # NOTE(review): the widget nesting below was reconstructed from a copy of
	    # the file that had lost its indentation -- confirm the intended layout.
	    with gr.Blocks(title="Simple VLM Interface") as demo:
	        with gr.Row():
	            with gr.Column():
	                image_input = gr.Image(type="pil", label="Upload Image (optional)")
	                text_input = gr.Textbox(label="Enter your text", lines=2)
	                with gr.Row():
	                    max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens")
	                    temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
	                    top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
	                submit_btn = gr.Button("Generate Response")
	                output_text = gr.Textbox(label="Response", lines=4)

	        # Input order must match generate_response's positional parameters.
	        submit_btn.click(
	            fn=vlm.generate_response,
	            inputs=[text_input, image_input, max_tokens, temperature, top_p],
	            outputs=output_text,
	        )

	    return demo

	# Script entry point: build the UI (which loads the model) and start serving.
	if __name__ == "__main__":
	    demo = create_interface()
	    demo.launch()