Spaces:

KangLiao
/

Puffin

Running on Zero

App Files Community

KangLiao committed on Oct 9

Commit

2788c5c

1 Parent(s): eeb3295

init

Browse files

Files changed (8) hide show

app.py +0 -2
configs/models/qwen2_5_1_5b_radio_sd3_dynamic_puffin.py +6 -5
configs/qwen2.5/config.json +27 -0
configs/qwen2.5/generation_config.json +14 -0
configs/qwen2.5/tokenizer.json +0 -0
configs/qwen2.5/tokenizer_config.json +207 -0
configs/qwen2.5/vocab.json +0 -0
configs/radio3/config.json +241 -0

app.py CHANGED Viewed

@@ -135,7 +135,6 @@ def generate_image(prompt_scene,
     prompt = prompt_scene + " " + prompt_camera
     bsz = 4
     with torch.no_grad():
         images, output_reasoning = model.generate(
@@ -159,7 +158,6 @@ def generate_image(prompt_scene,
         return ret_images
 # Gradio interface
 css = '''
 .gradio-container {max-width: 960px !important}

     prompt = prompt_scene + " " + prompt_camera
     bsz = 4
     with torch.no_grad():
         images, output_reasoning = model.generate(
         return ret_images
 # Gradio interface
 css = '''
 .gradio-container {max-width: 960px !important}

configs/models/qwen2_5_1_5b_radio_sd3_dynamic_puffin.py CHANGED Viewed

@@ -6,8 +6,9 @@ from src.models.radiov3.hf_model import RADIOModel
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
 from transformers import AutoModelForCausalLM, AutoTokenizer
-llm_name_or_path = 'Qwen/Qwen2.5-1.5B-Instruct'
 sd3_model_name_or_path = "configs/sd3"
 prompt_template = dict(
     SYSTEM=('<|im_start|>system\n{system}<|im_end|>\n'),
@@ -68,14 +69,14 @@ model = dict(type=Qwen2p5RadioStableDiffusion3HFDynamic,
              freeze_visual_encoder=True,
              freeze_llm=True,
              llm=dict(
-                 type=AutoModelForCausalLM.from_pretrained,
                  pretrained_model_name_or_path=llm_name_or_path,
                  torch_dtype=torch.bfloat16,
                  #local_files_only=True,
                  #attn_implementation='flash_attention_2',
              ),
              tokenizer=dict(
-                 type=AutoTokenizer.from_pretrained,
                  pretrained_model_name_or_path=llm_name_or_path,
                  #local_files_only=True,
                  ),
@@ -83,8 +84,8 @@ model = dict(type=Qwen2p5RadioStableDiffusion3HFDynamic,
              pretrained_pth=None,
              use_activation_checkpointing=False,
              visual_encoder=dict(
-                 type=RADIOModel.from_pretrained,
-                 pretrained_model_name_or_path="nvidia/C-RADIOv3-H",
                  torch_dtype=torch.bfloat16,
                  #local_files_only=True,
                  ),

 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
 from transformers import AutoModelForCausalLM, AutoTokenizer
+llm_name_or_path = 'configs/qwen2.5'
 sd3_model_name_or_path = "configs/sd3"
+radiov3_model_name_or_path = "configs/radiov3"
 prompt_template = dict(
     SYSTEM=('<|im_start|>system\n{system}<|im_end|>\n'),
              freeze_visual_encoder=True,
              freeze_llm=True,
              llm=dict(
+                 type=AutoModelForCausalLM.from_config,
                  pretrained_model_name_or_path=llm_name_or_path,
                  torch_dtype=torch.bfloat16,
                  #local_files_only=True,
                  #attn_implementation='flash_attention_2',
              ),
              tokenizer=dict(
+                 type=AutoTokenizer.from_config,
                  pretrained_model_name_or_path=llm_name_or_path,
                  #local_files_only=True,
                  ),
              pretrained_pth=None,
              use_activation_checkpointing=False,
              visual_encoder=dict(
+                 type=RADIOModel.from_config,
+                 pretrained_model_name_or_path=radiov3_model_name_or_path,
                  torch_dtype=torch.bfloat16,
                  #local_files_only=True,
                  ),

configs/qwen2.5/config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.43.1",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

configs/qwen2.5/generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "pad_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_p": 0.8,
+  "top_k": 20,
+  "transformers_version": "4.37.0"
+}

configs/qwen2.5/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

configs/qwen2.5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

configs/qwen2.5/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

configs/radio3/config.json ADDED Viewed

	@@ -0,0 +1,241 @@

+{
+  "adaptor_configs": {},
+  "adaptor_names": null,
+  "architectures": [
+    "RADIOModel"
+  ],
+  "args": {
+    "aa": null,
+    "amp": true,
+    "amp_dtype": "bfloat16",
+    "amp_impl": "native",
+    "aug_repeats": 0,
+    "aug_splits": 0,
+    "bn_eps": null,
+    "bn_momentum": null,
+    "cache_dir": null,
+    "channels_last": false,
+    "checkpoint_hist": 10,
+    "chk_keep_forever": 100,
+    "class_map": "",
+    "clip_grad": null,
+    "clip_mode": "norm",
+    "cls_token_per_teacher": true,
+    "coco_annotations_file": "/datasets/coco2017-adlsa/annotations/captions_val2017.json",
+    "coco_image_dir": "/datasets/coco2017-adlsa/val2017",
+    "color_jitter": 0.4,
+    "cooldown_epochs": 0,
+    "cpe_max_size": 2048,
+    "cpe_num_registers": 4,
+    "crd_loss": false,
+    "crd_loss_weight": 0.8,
+    "crop_pct": null,
+    "cutmix": 0.0,
+    "cutmix_minmax": null,
+    "dataset_download": false,
+    "debug_full_knn": false,
+    "decay_epochs": 90,
+    "decay_milestones": [
+      90,
+      180,
+      270
+    ],
+    "decay_rate": 0.1,
+    "depchain": true,
+    "detect_anomaly": false,
+    "dist_bn": "reduce",
+    "dist_norm_weight": 0.0,
+    "distributed": true,
+    "drop": 0.0,
+    "drop_block": null,
+    "drop_connect": null,
+    "drop_path": null,
+    "dtype": "float32",
+    "epoch_repeats": 0.0,
+    "eval": false,
+    "eval_metric": "knn_top1",
+    "eval_teacher": false,
+    "eval_teacher_only": false,
+    "eval_throughput": false,
+    "fast_norm": false,
+    "fd_loss_fn": "MSE",
+    "feature_normalization": "PHI_STANDARDIZE",
+    "feature_summarizer": "cls_token",
+    "feature_upscale_factor": null,
+    "force_new_wandb_id": false,
+    "force_spectral_reparam": false,
+    "freeze_bn": false,
+    "fsdp": true,
+    "full_equivariance": false,
+    "fuser": "",
+    "gp": null,
+    "grad_accum_steps": 1,
+    "grad_checkpointing": false,
+    "head_init_bias": null,
+    "head_init_scale": null,
+    "head_lr": null,
+    "head_warmup": 5,
+    "head_weight_decay": 0.01,
+    "hflip": 0.5,
+    "img_size": null,
+    "in_chans": null,
+    "initial_checkpoint": null,
+    "input_size": null,
+    "interpolation": "",
+    "layer_decay": null,
+    "local_rank": 0,
+    "log_interval": 50,
+    "log_mlflow": false,
+    "log_wandb": true,
+    "loss_auto_balance": false,
+    "lr_base": 0.1,
+    "lr_base_scale": "",
+    "lr_base_size": 256,
+    "lr_cycle_decay": 0.5,
+    "lr_cycle_limit": 1,
+    "lr_cycle_mul": 1.0,
+    "lr_k_decay": 1.0,
+    "lr_noise": null,
+    "lr_noise_pct": 0.67,
+    "lr_noise_std": 1.0,
+    "mean": null,
+    "mesa": false,
+    "min_lr": 0.0001,
+    "mixup": 0.0,
+    "mixup_mode": "batch",
+    "mixup_off_epoch": 0,
+    "mixup_prob": 1.0,
+    "mixup_switch_prob": 0.5,
+    "mlp_hidden_size": 2560,
+    "mlp_num_inner": 1,
+    "mlp_version": "v2",
+    "model": "vit_huge_patch16_224",
+    "model_kwargs": {},
+    "model_norm": false,
+    "momentum": 0.9,
+    "no_aug": false,
+    "no_custom_validation": false,
+    "no_ddp_bb": true,
+    "no_knn": false,
+    "no_prefetcher": false,
+    "no_resume_opt": false,
+    "num_classes": null,
+    "one_logger_app_tag": "",
+    "one_logger_is_baseline": false,
+    "one_logger_run_name": "",
+    "onelogger": null,
+    "opt_betas": null,
+    "opt_eps": null,
+    "patience_epochs": 10,
+    "pin_mem": false,
+    "prefetcher": true,
+    "pretrained": false,
+    "rank": 0,
+    "ratio": [
+      0.75,
+      1.3333333333333333
+    ],
+    "recount": 1,
+    "recovery_interval": 0,
+    "register_multiple": 0,
+    "remode": "pixel",
+    "reprob": 0.0,
+    "reset_loss_state": true,
+    "resplit": false,
+    "sample_tracking": false,
+    "save_images": false,
+    "scale": [
+      0.5,
+      1.0
+    ],
+    "sched": "cosine",
+    "seed": 42,
+    "shift_equivariance": true,
+    "smoothing": 0.1,
+    "spectral_heads": false,
+    "spectral_reparam": false,
+    "spectral_weight_decay": null,
+    "split_bn": false,
+    "start_epoch": null,
+    "std": null,
+    "stream_teachers": true,
+    "sync_bn": false,
+    "synchronize_step": false,
+    "teachers": [
+      {
+        "fd_normalize": false,
+        "feature_distillation": true,
+        "input_size": 378,
+        "model": "ViT-H-14-378-quickgelu",
+        "name": "clip",
+        "pretrained": "dfn5b",
+        "type": "open_clip",
+        "use_summary": true
+      },
+      {
+        "fd_normalize": false,
+        "feature_distillation": true,
+        "input_size": 384,
+        "model": "siglip2-g-384",
+        "name": "siglip2-g",
+        "type": "siglip2",
+        "use_summary": true
+      },
+      {
+        "fd_normalize": false,
+        "feature_distillation": true,
+        "input_size": 224,
+        "model": "dinov2_vitg14_reg",
+        "name": "dino_v2",
+        "type": "dino_v2",
+        "use_summary": true
+      },
+      {
+        "fd_normalize": false,
+        "feature_distillation": true,
+        "input_size": 1024,
+        "model": "vit-h",
+        "name": "sam",
+        "type": "sam",
+        "use_summary": false
+      }
+    ],
+    "torchcompile": null,
+    "torchscript": false,
+    "train_interpolation": "random",
+    "train_split": "train",
+    "tta": 0,
+    "use_coco": false,
+    "use_multi_epochs_loader": false,
+    "val_ema_only": false,
+    "val_split": "val",
+    "vflip": 0.0,
+    "vitdet_version": 1,
+    "wandb_entity": "",
+    "wandb_id": "",
+    "wandb_job_type": "",
+    "wandb_name": "",
+    "wandb_project": "",
+    "warmup_lr": 1e-05,
+    "warmup_prefix": false,
+    "worker_seeding": "all",
+    "workers": 8,
+    "world_size": 256
+  },
+  "auto_map": {
+    "AutoConfig": "hf_model.RADIOConfig",
+    "AutoModel": "hf_model.RADIOModel"
+  },
+  "feature_normalizer_config": null,
+  "inter_feature_normalizer_config": null,
+  "max_resolution": 2048,
+  "patch_size": 16,
+  "preferred_resolution": [
+    512,
+    512
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "version": "c-radio_v3-h",
+  "vitdet_window_size": null
+}