Spaces:

songweig
/

rich-text-to-image

Runtime error

App Files Files Community

songweig committed on Apr 30, 2023

Commit

41fdef7

1 Parent(s): 757e20b

update token map

Browse files

Files changed (6) hide show

app.py +202 -100
models/attention.py +20 -8
models/region_diffusion.py +222 -31
models/unet_2d_blocks.py +244 -59
utils/attention_utils.py +147 -25
utils/richtext_utils.py +8 -8

app.py CHANGED Viewed

@@ -22,18 +22,17 @@ from share_btn import community_icon_html, loading_icon_html, share_js, css
 help_text = """
 If you are encountering an error or not achieving your desired outcome, here are some potential reasons and recommendations to consider:
 1. If you format only a portion of a word rather than the complete word, an error may occur.
-2. The token map may not always accurately capture the region of the formatted tokens. If you're experiencing this problem, experiment with selecting more or fewer tokens to expand or reduce the area covered by the token maps.
-3. If you use font color and get completely corrupted results, you may consider decrease the color weight lambda.
-4. Consider using a different seed.
 """
 canvas_html = """<iframe id='rich-text-root' style='width:100%' height='360px' src='file=rich-text-to-json-iframe.html' frameborder='0' scrolling='no'></iframe>"""
 get_js_data = """
-async (text_input, negative_prompt, height, width, seed, steps, guidance_weight, color_guidance_weight, rich_text_input) => {
   const richEl = document.getElementById("rich-text-root");
   const data = richEl? richEl.contentDocument.body._data : {};
-  return [text_input, negative_prompt, height, width, seed, steps, guidance_weight, color_guidance_weight, JSON.stringify(data)];
 }
 """
 set_js_data = """
@@ -71,9 +70,13 @@ def main():
         width: int,
         seed: int,
         steps: int,
         guidance_weight: float,
         color_guidance_weight: float,
-        rich_text_input: str
     ):
         run_dir = 'results/'
         # Load region diffusion model.
@@ -88,7 +91,7 @@ def main():
         # parse json to span attributes
         base_text_prompt, style_text_prompts, footnote_text_prompts, footnote_target_tokens,\
             color_text_prompts, color_names, color_rgbs, size_text_prompts_and_sizes, use_grad_guidance = parse_json(
-                json.loads(text_input), device)
         # create control input for region diffusion
         region_text_prompts, region_target_token_ids, base_tokens = get_region_diffusion_input(
@@ -108,7 +111,7 @@ def main():
         # get token maps from plain text to image generation.
         begin_time = time.time()
         if model.attention_maps is None:
-            model.register_evaluation_hooks()
         else:
             model.reset_attention_maps()
         plain_img = model.produce_attn_maps([base_text_prompt], [negative_text],
@@ -116,27 +119,38 @@ def main():
                                             guidance_scale=guidance_weight)
         print('time lapses to get attention maps: %.4f' %
               (time.time()-begin_time))
-        color_obj_masks, _ = get_token_maps(
-            model.attention_maps, run_dir, width//8, height//8, color_target_token_ids, seed)
-        model.masks, token_maps = get_token_maps(
-            model.attention_maps, run_dir, width//8, height//8, region_target_token_ids, seed, base_tokens)
         color_obj_masks = [transforms.functional.resize(color_obj_mask, (height, width),
                                                         interpolation=transforms.InterpolationMode.BICUBIC,
                                                         antialias=True)
                            for color_obj_mask in color_obj_masks]
         text_format_dict['color_obj_atten'] = color_obj_masks
-        model.remove_evaluation_hooks()
         # generate image from rich text
         begin_time = time.time()
         seed_everything(seed)
         rich_img = model.prompt_to_img(region_text_prompts, [negative_text],
                                        height=height, width=width, num_inference_steps=steps,
-                                       guidance_scale=guidance_weight, use_grad_guidance=use_grad_guidance,
-                                       text_format_dict=text_format_dict)
         print('time lapses to generate image from rich text: %.4f' %
               (time.time()-begin_time))
-        return [plain_img[0], rich_img[0], token_maps]
     with gr.Blocks(css=css) as demo:
         url_params = gr.JSON({}, visible=False, label="URL Params")
@@ -162,6 +176,29 @@ def main():
                     placeholder='Example: poor quality, blurry, dark, low resolution, low quality, worst quality',
                     elem_id="negative_prompt"
                 )
                 seed = gr.Slider(label='Seed',
                                  minimum=0,
                                  maximum=100000,
@@ -169,15 +206,14 @@ def main():
                                  value=6,
                                  elem_id="seed"
                                  )
-                color_guidance_weight = gr.Slider(label='Color weight lambda',
-                                                  minimum=0,
-                                                  maximum=2,
-                                                  step=0.1,
-                                                  value=0.5)
                 with gr.Accordion('Other Parameters', open=False):
                     steps = gr.Slider(label='Number of Steps',
                                       minimum=0,
-                                      maximum=100,
                                       step=1,
                                       value=41)
                     guidance_weight = gr.Slider(label='CFG weight',
@@ -206,6 +242,8 @@ def main():
                 with gr.Row():
                     plaintext_result = gr.Image(
                         label='Plain-text', elem_id="plain-text-image")
                     token_map = gr.Image(label='Token Maps')
                 with gr.Row(visible=False) as share_row:
                     with gr.Group(elem_id="share-btn-container"):
@@ -218,181 +256,238 @@ def main():
             gr.Markdown(help_text)
         with gr.Row():
-            style_examples = [
                 [
-                    '{"ops":[{"insert":"a "},{"attributes":{"font":"slabo"},"insert":"night sky filled with stars"},{"insert":" above a "},{"attributes":{"font":"roboto"},"insert":"turbulent sea with giant waves"}]}',
                     '',
-                    512,
-                    512,
                     6,
                     1,
-                    None
                 ],
                 [
-                    '{"ops":[{"insert":"a "},{"attributes":{"font":"mirza"},"insert":"beautiful garden"},{"insert":" with a "},{"attributes":{"font":"roboto"},"insert":"snow mountain in the background"},{"insert":""}]}',
                     '',
-                    512,
-                    512,
-                    3,
                     1,
-                    None
                 ],
                 [
-                    '{"ops":[{"attributes":{"link":"the awe-inspiring sky and ocean in the style of J.M.W. Turner"},"insert":"the awe-inspiring sky and sea"},{"insert":" by "},{"attributes":{"font":"mirza"},"insert":"a coast with flowers and grasses in spring"}]}',
-                    'worst quality, dark, poor quality',
-                    512,
-                    512,
-                    9,
                     1,
-                    None
                 ],
             ]
-            gr.Examples(examples=style_examples,
-                        label='Font style examples',
                         inputs=[
                             text_input,
                             negative_prompt,
-                            height,
-                            width,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
                         ],
                         outputs=[
                             plaintext_result,
                             richtext_result,
                             token_map,
                         ],
                         fn=generate,
                         # cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
-            footnote_examples = [
                 [
-                    '{"ops":[{"insert":"A close-up 4k dslr photo of a "},{"attributes":{"link":"A cat wearing sunglasses and a bandana around its neck."},"insert":"cat"},{"insert":" riding a scooter. Palm trees in the background."}]}',
-                    '',
-                    512,
-                    512,
                     6,
-                    1,
-                    None
                 ],
                 [
-                    '{"ops":[{"insert":"A "},{"attributes":{"link":"kitchen island with a built-in oven and a stove with gas burners "},"insert":"kitchen island"},{"insert":" next to a "},{"attributes":{"link":"an open refrigerator stocked with fresh produce, dairy products, and beverages. "},"insert":"refrigerator"},{"insert":", by James McDonald and Joarc Architects, home, interior, octane render, deviantart, cinematic, key art, hyperrealism, sun light, sunrays, canon eos c 300, ƒ 1.8, 35 mm, 8k, medium - format print"}]}',
-                    '',
-                    512,
-                    512,
                     6,
-                    1,
-                    None
                 ],
                 [
-                    '{"ops":[{"insert":"A "},{"attributes":{"link":"Art inspired by kung fu panda, elder, asian art, volumetric lighting, dramatic scene, ultra detailed, realism, chinese"},"insert":"panda"},{"insert":" standing on a cliff by a waterfall, wildlife photography, photograph, high quality, wildlife, f 1.8, soft focus, 8k, national geographic, award - winning photograph by nick nichols"}]}',
                     '',
-                    512,
-                    512,
                     6,
                     1,
-                    None
                 ],
             ]
-            gr.Examples(examples=footnote_examples,
-                        label='Footnote examples',
                         inputs=[
                             text_input,
                             negative_prompt,
-                            height,
-                            width,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
                         ],
                         outputs=[
                             plaintext_result,
                             richtext_result,
                             token_map,
                         ],
                         fn=generate,
                         # cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
-            color_examples = [
                 [
-                    '{"ops":[{"insert":"a Gothic "},{"attributes":{"color":"#b26b00"},"insert":"church"},{"insert":" in a the sunset with a beautiful landscape in the background."}]}',
                     '',
-                    512,
-                    512,
-                    6,
-                    1,
-                    None
                 ],
                 [
-                    '{"ops":[{"insert":"A mesmerizing sight that captures the beauty of a "},{"attributes":{"color":"#4775fc"},"insert":"rose"},{"insert":" blooming, close up"}]}',
-                    '',
-                    512,
-                    512,
                     9,
-                    1,
-                    None
                 ],
                 [
-                    '{"ops":[{"insert":"A "},{"attributes":{"color":"#FFD700"},"insert":"marble statue of a wolf\'s head and shoulder"},{"insert":", surrounded by colorful flowers michelangelo, detailed, intricate, full of color, led lighting, trending on artstation, 4 k, hyperrealistic, 3 5 mm, focused, extreme details, unreal engine 5, masterpiece "}]}',
                     '',
-                    512,
-                    512,
-                    5,
-                    0.6,
-                    None
                 ],
             ]
-            gr.Examples(examples=color_examples,
-                        label='Font color examples',
                         inputs=[
                             text_input,
                             negative_prompt,
-                            height,
-                            width,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
                         ],
                         outputs=[
                             plaintext_result,
                             richtext_result,
                             token_map,
                         ],
                         fn=generate,
                         # cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
             size_examples = [
                 [
                     '{"ops": [{"insert": "A pizza with "}, {"attributes": {"size": "60px"}, "insert": "pineapple"}, {"insert": ", pepperoni, and mushroom on the top, 4k, photorealistic"}]}',
                     'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
-                    512,
-                    512,
                     13,
                     1,
-                    None
                 ],
                 [
                     '{"ops": [{"insert": "A pizza with pineapple, "}, {"attributes": {"size": "20px"}, "insert": "pepperoni"}, {"insert": ", and mushroom on the top, 4k, photorealistic"}]}',
                     'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
-                    512,
-                    512,
                     13,
                     1,
-                    None
                 ],
                 [
                     '{"ops": [{"insert": "A pizza with pineapple, pepperoni, and "}, {"attributes": {"size": "70px"}, "insert": "mushroom"}, {"insert": " on the top, 4k, photorealistic"}]}',
                     'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
-                    512,
-                    512,
                     13,
                     1,
-                    None
                 ],
             ]
             gr.Examples(examples=size_examples,
@@ -400,15 +495,18 @@ def main():
                         inputs=[
                             text_input,
                             negative_prompt,
-                            height,
-                            width,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
                         ],
                         outputs=[
                             plaintext_result,
                             richtext_result,
                             token_map,
                         ],
                         fn=generate,
@@ -423,11 +521,15 @@ def main():
                 width,
                 seed,
                 steps,
                 guidance_weight,
                 color_guidance_weight,
-                rich_text_input
             ],
-            outputs=[plaintext_result, richtext_result, token_map],
             _js=get_js_data
         ).then(
             fn=lambda: gr.update(visible=True), inputs=None, outputs=share_row, queue=False)

 help_text = """
 If you are encountering an error or not achieving your desired outcome, here are some potential reasons and recommendations to consider:
 1. If you format only a portion of a word rather than the complete word, an error may occur.
+2. If you use font color and get completely corrupted results, you may consider decrease the color weight lambda.
+3. Consider using a different seed.
 """
 canvas_html = """<iframe id='rich-text-root' style='width:100%' height='360px' src='file=rich-text-to-json-iframe.html' frameborder='0' scrolling='no'></iframe>"""
 get_js_data = """
+async (text_input, negative_prompt, height, width, seed, steps, num_segments, segment_threshold, inject_interval, guidance_weight, color_guidance_weight, rich_text_input, background_aug) => {
   const richEl = document.getElementById("rich-text-root");
   const data = richEl? richEl.contentDocument.body._data : {};
+  return [text_input, negative_prompt, height, width, seed, steps, num_segments, segment_threshold, inject_interval, guidance_weight, color_guidance_weight, JSON.stringify(data), background_aug];
 }
 """
 set_js_data = """
         width: int,
         seed: int,
         steps: int,
+        num_segments: int,
+        segment_threshold: float,
+        inject_interval: float,
         guidance_weight: float,
         color_guidance_weight: float,
+        rich_text_input: str,
+        background_aug: bool,
     ):
         run_dir = 'results/'
         # Load region diffusion model.
         # parse json to span attributes
         base_text_prompt, style_text_prompts, footnote_text_prompts, footnote_target_tokens,\
             color_text_prompts, color_names, color_rgbs, size_text_prompts_and_sizes, use_grad_guidance = parse_json(
+                json.loads(text_input))
         # create control input for region diffusion
         region_text_prompts, region_target_token_ids, base_tokens = get_region_diffusion_input(
         # get token maps from plain text to image generation.
         begin_time = time.time()
         if model.attention_maps is None:
+            model.register_tokenmap_hooks()
         else:
             model.reset_attention_maps()
         plain_img = model.produce_attn_maps([base_text_prompt], [negative_text],
                                             guidance_scale=guidance_weight)
         print('time lapses to get attention maps: %.4f' %
               (time.time()-begin_time))
+        seed_everything(seed)
+        color_obj_masks, segments_vis, token_maps = get_token_maps(model.selfattn_maps, model.crossattn_maps, model.n_maps, run_dir,
+                                                                   512//8, 512//8, color_target_token_ids[:-1], seed,
+                                                                   base_tokens, segment_threshold=segment_threshold, num_segments=num_segments,
+                                                                   return_vis=True)
+        seed_everything(seed)
+        model.masks, segments_vis, token_maps = get_token_maps(model.selfattn_maps, model.crossattn_maps, model.n_maps, run_dir,
+                                                               512//8, 512//8, region_target_token_ids[:-1], seed,
+                                                               base_tokens, segment_threshold=segment_threshold, num_segments=num_segments,
+                                                               return_vis=True)
         color_obj_masks = [transforms.functional.resize(color_obj_mask, (height, width),
                                                         interpolation=transforms.InterpolationMode.BICUBIC,
                                                         antialias=True)
                            for color_obj_mask in color_obj_masks]
         text_format_dict['color_obj_atten'] = color_obj_masks
+        model.remove_tokenmap_hooks()
         # generate image from rich text
         begin_time = time.time()
         seed_everything(seed)
+        if background_aug:
+            bg_aug_end = 500
+        else:
+            bg_aug_end = 1000
         rich_img = model.prompt_to_img(region_text_prompts, [negative_text],
                                        height=height, width=width, num_inference_steps=steps,
+                                       guidance_scale=guidance_weight, use_guidance=use_grad_guidance,
+                                       text_format_dict=text_format_dict, inject_selfattn=inject_interval,
+                                       bg_aug_end=bg_aug_end)
         print('time lapses to generate image from rich text: %.4f' %
               (time.time()-begin_time))
+        return [plain_img[0], rich_img[0], segments_vis, token_maps]
     with gr.Blocks(css=css) as demo:
         url_params = gr.JSON({}, visible=False, label="URL Params")
                     placeholder='Example: poor quality, blurry, dark, low resolution, low quality, worst quality',
                     elem_id="negative_prompt"
                 )
+                segment_threshold = gr.Slider(label='Token map threshold',
+                                              info='(See less area in token maps? Decrease this. See too much area? Increase this.)',
+                                              minimum=0,
+                                              maximum=1,
+                                              step=0.01,
+                                              value=0.25)
+                inject_interval = gr.Slider(label='Detail preservation',
+                                            info='(To preserve more structure from plain-text generation, increase this. To see more rich-text attributes, decrease this.)',
+                                            minimum=0,
+                                            maximum=1,
+                                            step=0.01,
+                                            value=0.)
+                color_guidance_weight = gr.Slider(label='Color weight',
+                                                  info='(To obtain more precise color, increase this, while too large value may cause artifacts.)',
+                                                  minimum=0,
+                                                  maximum=2,
+                                                  step=0.1,
+                                                  value=0.5)
+                num_segments = gr.Slider(label='Number of segments',
+                                         minimum=2,
+                                         maximum=20,
+                                         step=1,
+                                         value=9)
                 seed = gr.Slider(label='Seed',
                                  minimum=0,
                                  maximum=100000,
                                  value=6,
                                  elem_id="seed"
                                  )
+                background_aug = gr.Checkbox(
+                    label='Precise region alignment',
+                    info='(For strict region alignment, select this option, but beware of potential artifacts when using with style.)',
+                    value=True)
                 with gr.Accordion('Other Parameters', open=False):
                     steps = gr.Slider(label='Number of Steps',
                                       minimum=0,
+                                      maximum=500,
                                       step=1,
                                       value=41)
                     guidance_weight = gr.Slider(label='CFG weight',
                 with gr.Row():
                     plaintext_result = gr.Image(
                         label='Plain-text', elem_id="plain-text-image")
+                    segments = gr.Image(label='Segmentation')
+                with gr.Row():
                     token_map = gr.Image(label='Token Maps')
                 with gr.Row(visible=False) as share_row:
                     with gr.Group(elem_id="share-btn-container"):
             gr.Markdown(help_text)
         with gr.Row():
+            footnote_examples = [
                 [
+                    '{"ops":[{"insert":"A close-up 4k dslr photo of a "},{"attributes":{"link":"A cat wearing sunglasses and a bandana around its neck."},"insert":"cat"},{"insert":" riding a scooter. Palm trees in the background."}]}',
                     '',
+                    5,
+                    0.3,
+                    0,
                     6,
                     1,
+                    None,
+                    True
                 ],
                 [
+                    '{"ops":[{"insert":"A "},{"attributes":{"link":"kitchen island with a stove with gas burners and a built-in oven "},"insert":"kitchen island"},{"insert":" next to a "},{"attributes":{"link":"an open refrigerator stocked with fresh produce, dairy products, and beverages. "},"insert":"refrigerator"},{"insert":", by James McDonald and Joarc Architects, home, interior, octane render, deviantart, cinematic, key art, hyperrealism, sun light, sunrays, canon eos c 300, ƒ 1.8, 35 mm, 8k, medium - format print"}]}',
                     '',
+                    6,
+                    0.5,
+                    0,
+                    6,
                     1,
+                    None,
+                    True
                 ],
                 [
+                    '{"ops":[{"insert":"A "},{"attributes":{"link":"Happy Kung fu panda art, elder, asian art, volumetric lighting, dramatic scene, ultra detailed, realism, chinese"},"insert":"panda"},{"insert":" standing on a cliff by a waterfall, wildlife photography, photograph, high quality, wildlife, f 1.8, soft focus, 8k, national geographic, award - winning photograph by nick nichols"}]}',
+                    '',
+                    4,
+                    0.3,
+                    0,
+                    4,
                     1,
+                    None,
+                    True
                 ],
             ]
+            gr.Examples(examples=footnote_examples,
+                        label='Footnote examples',
                         inputs=[
                             text_input,
                             negative_prompt,
+                            num_segments,
+                            segment_threshold,
+                            inject_interval,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
+                            background_aug,
                         ],
                         outputs=[
                             plaintext_result,
                             richtext_result,
+                            segments,
                             token_map,
                         ],
                         fn=generate,
                         # cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
+            color_examples = [
                 [
+                    '{"ops":[{"insert":"a beautifule girl with big eye, skin, and long "},{"attributes":{"color":"#00ffff"},"insert":"hair"},{"insert":", t-shirt, bursting with vivid color, intricate, elegant, highly detailed, photorealistic, digital painting,  artstation, illustration, concept art."}]}',
+                    'lowres, had anatomy, bad hands, cropped, worst quality',
+                    9,
+                    0.25,
+                    0.3,
                     6,
+                    0.5,
+                    None,
+                    True
                 ],
                 [
+                    '{"ops":[{"insert":"a beautifule girl with big eye, skin, and long "},{"attributes":{"color":"#eeeeee"},"insert":"hair"},{"insert":", t-shirt, bursting with vivid color, intricate, elegant, highly detailed, photorealistic, digital painting,  artstation, illustration, concept art."}]}',
+                    'lowres, had anatomy, bad hands, cropped, worst quality',
+                    9,
+                    0.25,
+                    0.3,
                     6,
+                    0.1,
+                    None,
+                    True
                 ],
                 [
+                    '{"ops":[{"insert":"a Gothic "},{"attributes":{"color":"#FD6C9E"},"insert":"church"},{"insert":" in a the sunset with a beautiful landscape in the background."}]}',
                     '',
+                    5,
+                    0.3,
+                    0.3,
                     6,
+                    0.5,
+                    None,
+                    False
+                ],
+                [
+                    '{"ops":[{"insert":"A mesmerizing sight that captures the beauty of a "},{"attributes":{"color":"#4775fc"},"insert":"rose"},{"insert":" blooming, close up"}]}',
+                    '',
+                    3,
+                    0.3,
+                    0,
+                    9,
                     1,
+                    None,
+                    False
+                ],
+                [
+                    '{"ops":[{"insert":"A "},{"attributes":{"color":"#FFD700"},"insert":"marble statue of a wolf\'s head and shoulder"},{"insert":", surrounded by colorful flowers michelangelo, detailed, intricate, full of color, led lighting, trending on artstation, 4 k, hyperrealistic, 3 5 mm, focused, extreme details, unreal engine 5, masterpiece "}]}',
+                    '',
+                    5,
+                    0.3,
+                    0,
+                    5,
+                    0.6,
+                    None,
+                    False
                 ],
             ]
+            gr.Examples(examples=color_examples,
+                        label='Font color examples',
                         inputs=[
                             text_input,
                             negative_prompt,
+                            num_segments,
+                            segment_threshold,
+                            inject_interval,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
+                            background_aug,
                         ],
                         outputs=[
                             plaintext_result,
                             richtext_result,
+                            segments,
                             token_map,
                         ],
                         fn=generate,
                         # cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
+            style_examples = [
                 [
+                    '{"ops":[{"insert":"a "},{"attributes":{"font":"mirza"},"insert":"beautiful garden"},{"insert":" with a "},{"attributes":{"font":"roboto"},"insert":"snow mountain in the background"},{"insert":""}]}',
                     '',
+                    5,
+                    0.3,
+                    0.2,
+                    3,
+                    0.5,
+                    None,
+                    False
                 ],
                 [
+                    '{"ops":[{"attributes":{"link":"the awe-inspiring sky and ocean in the style of J.M.W. Turner"},"insert":"the awe-inspiring sky and sea"},{"insert":" by "},{"attributes":{"font":"mirza"},"insert":"a coast with flowers and grasses in spring"}]}',
+                    'worst quality, dark, poor quality',
+                    5,
+                    0.3,
+                    0,
                     9,
+                    0.5,
+                    None,
+                    False
                 ],
                 [
+                    '{"ops":[{"insert":"a "},{"attributes":{"font":"slabo"},"insert":"night sky filled with stars"},{"insert":" above a "},{"attributes":{"font":"roboto"},"insert":"turbulent sea with giant waves"}]}',
                     '',
+                    2,
+                    0.4,
+                    0,
+                    6,
+                    0.5,
+                    None,
+                    False
                 ],
             ]
+            gr.Examples(examples=style_examples,
+                        label='Font style examples',
                         inputs=[
                             text_input,
                             negative_prompt,
+                            num_segments,
+                            segment_threshold,
+                            inject_interval,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
+                            background_aug,
                         ],
                         outputs=[
                             plaintext_result,
                             richtext_result,
+                            segments,
                             token_map,
                         ],
                         fn=generate,
                         # cache_examples=True,
                         examples_per_page=20)
         with gr.Row():
             size_examples = [
                 [
                     '{"ops": [{"insert": "A pizza with "}, {"attributes": {"size": "60px"}, "insert": "pineapple"}, {"insert": ", pepperoni, and mushroom on the top, 4k, photorealistic"}]}',
                     'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
+                    5,
+                    0.3,
+                    0,
                     13,
                     1,
+                    None,
+                    False
                 ],
                 [
                     '{"ops": [{"insert": "A pizza with pineapple, "}, {"attributes": {"size": "20px"}, "insert": "pepperoni"}, {"insert": ", and mushroom on the top, 4k, photorealistic"}]}',
                     'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
+                    5,
+                    0.3,
+                    0,
                     13,
                     1,
+                    None,
+                    False
                 ],
                 [
                     '{"ops": [{"insert": "A pizza with pineapple, pepperoni, and "}, {"attributes": {"size": "70px"}, "insert": "mushroom"}, {"insert": " on the top, 4k, photorealistic"}]}',
                     'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
+                    5,
+                    0.3,
+                    0,
                     13,
                     1,
+                    None,
+                    False
                 ],
             ]
             gr.Examples(examples=size_examples,
                         inputs=[
                             text_input,
                             negative_prompt,
+                            num_segments,
+                            segment_threshold,
+                            inject_interval,
                             seed,
                             color_guidance_weight,
                             rich_text_input,
+                            background_aug,
                         ],
                         outputs=[
                             plaintext_result,
                             richtext_result,
+                            segments,
                             token_map,
                         ],
                         fn=generate,
                 width,
                 seed,
                 steps,
+                num_segments,
+                segment_threshold,
+                inject_interval,
                 guidance_weight,
                 color_guidance_weight,
+                rich_text_input,
+                background_aug
             ],
+            outputs=[plaintext_result, richtext_result, segments, token_map],
             _js=get_js_data
         ).then(
             fn=lambda: gr.update(visible=True), inputs=None, outputs=share_row, queue=False)

models/attention.py CHANGED Viewed

@@ -492,7 +492,7 @@ class BasicTransformerBlock(nn.Module):
         if self.only_cross_attention:
             attn_out, _ = self.attn1(
-                norm_hidden_states, context, text_format_dict=text_format_dict) + hidden_states
             hidden_states = attn_out + hidden_states
         else:
             attn_out, _ = self.attn1(norm_hidden_states)
@@ -583,7 +583,7 @@ class CrossAttention(nn.Module):
                                 head_size, seq_len, seq_len2)
         return tensor.mean(1)
-    def forward(self, hidden_states, context=None, mask=None, text_format_dict={}):
         batch_size, sequence_length, _ = hidden_states.shape
         query = self.to_q(hidden_states)
@@ -607,7 +607,7 @@ class CrossAttention(nn.Module):
             if self._slice_size is None or query.shape[0] // self._slice_size == 1:
                 # only this attention function is used
                 hidden_states, attn_probs = self._attention(
-                    query, key, value, **text_format_dict)
         # linear proj
         hidden_states = self.to_out[0](hidden_states)
@@ -625,11 +625,11 @@ class CrossAttention(nn.Module):
             alpha=self.scale,
         )
-    def _attention(self, query, key, value, word_pos=None, font_size=None,
                    **kwargs):
         attention_scores = self._qk(query, key)
-        # Font size:
         if self.is_cross_attn and word_pos is not None and font_size is not None:
             assert key.shape[1] == 77
             attention_score_exp = attention_scores.exp()
@@ -642,13 +642,25 @@ class CrossAttention(nn.Module):
         else:
             attention_probs = attention_scores.softmax(dim=-1)
-        hidden_states = torch.bmm(attention_probs, value)
         # reshape hidden_states
         hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
-        attention_probs = self.reshape_batch_dim_to_heads_and_average(
             attention_probs)
-        return hidden_states, attention_probs
     def _memory_efficient_attention_xformers(self, query, key, value):
         query = query.contiguous()

         if self.only_cross_attention:
             attn_out, _ = self.attn1(
+                norm_hidden_states, context=context, text_format_dict=text_format_dict) + hidden_states
             hidden_states = attn_out + hidden_states
         else:
             attn_out, _ = self.attn1(norm_hidden_states)
                                 head_size, seq_len, seq_len2)
         return tensor.mean(1)
+    def forward(self, hidden_states, real_attn_probs=None, context=None, mask=None, text_format_dict={}):
         batch_size, sequence_length, _ = hidden_states.shape
         query = self.to_q(hidden_states)
             if self._slice_size is None or query.shape[0] // self._slice_size == 1:
                 # only this attention function is used
                 hidden_states, attn_probs = self._attention(
+                    query, key, value, real_attn_probs, **text_format_dict)
         # linear proj
         hidden_states = self.to_out[0](hidden_states)
             alpha=self.scale,
         )
+    def _attention(self, query, key, value, real_attn_probs=None, word_pos=None, font_size=None,
                    **kwargs):
         attention_scores = self._qk(query, key)
+        # Font size V2:
         if self.is_cross_attn and word_pos is not None and font_size is not None:
             assert key.shape[1] == 77
             attention_score_exp = attention_scores.exp()
         else:
             attention_probs = attention_scores.softmax(dim=-1)
+        # compute attention output
+        if real_attn_probs is None:
+            hidden_states = torch.bmm(attention_probs, value)
+        else:
+            if isinstance(real_attn_probs, dict):
+                for pos1, pos2 in zip(real_attn_probs['inject_pos'][0], real_attn_probs['inject_pos'][1]):
+                    attention_probs[:, :,
+                                    pos2] = real_attn_probs['reference'][:, :, pos1]
+                hidden_states = torch.bmm(attention_probs, value)
+            else:
+                hidden_states = torch.bmm(real_attn_probs, value)
         # reshape hidden_states
         hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+        # we also return the map averaged over heads to save memory footprint
+        attention_probs_avg = self.reshape_batch_dim_to_heads_and_average(
             attention_probs)
+        return hidden_states, [attention_probs_avg, attention_probs]
     def _memory_efficient_attention_xformers(self, query, key, value):
         query = query.contiguous()

models/region_diffusion.py CHANGED Viewed

@@ -6,6 +6,7 @@ from functools import partial
 from transformers import CLIPTextModel, CLIPTokenizer, logging
 from diffusers import AutoencoderKL, PNDMScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
 from models.unet_2d_condition import UNet2DConditionModel
 # suppress partial model loading warning
 logging.set_verbosity_error()
@@ -38,6 +39,7 @@ class RegionDiffusion(nn.Module):
         self.masks = []
         self.attention_maps = None
         self.color_loss = torch.nn.functional.mse_loss
         print(f'[INFO] loaded stable diffusion!')
@@ -79,47 +81,83 @@ class RegionDiffusion(nn.Module):
         return text_embeddings
     def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5,
-                        latents=None, use_grad_guidance=False, text_format_dict={}):
         if latents is None:
             latents = torch.randn(
                 (1, self.unet.in_channels, height // 8, width // 8), device=self.device)
         self.scheduler.set_timesteps(num_inference_steps)
         n_styles = text_embeddings.shape[0]-1
         assert n_styles == len(self.masks)
         with torch.autocast('cuda'):
             for i, t in enumerate(self.scheduler.timesteps):
                 # predict the noise residual
                 with torch.no_grad():
-                    noise_pred_uncond = self.unet(latents, t, encoder_hidden_states=text_embeddings[:1],
-                                                  text_format_dict={})['sample']
-                    noise_pred_text = None
-                    for style_i, mask in enumerate(self.masks):
-                        if style_i < len(self.masks) - 1:
-                            masked_latent = latents
-                            noise_pred_text_cur = self.unet(masked_latent, t, encoder_hidden_states=text_embeddings[style_i+1:style_i+2],
                                                             text_format_dict={})['sample']
                         else:
-                            noise_pred_text_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[style_i+1:style_i+2],
-                                                            text_format_dict=text_format_dict)['sample']
-                        if noise_pred_text is None:
-                            noise_pred_text = noise_pred_text_cur * mask
-                        else:
-                            noise_pred_text = noise_pred_text + noise_pred_text_cur*mask
                 # perform classifier-free guidance
                 noise_pred = noise_pred_uncond + guidance_scale * \
                     (noise_pred_text - noise_pred_uncond)
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents)[
-                    'prev_sample']
-                # apply gradient guidance
-                if use_grad_guidance and t < text_format_dict['guidance_start_step']:
                     with torch.enable_grad():
                         if not latents.requires_grad:
                             latents.requires_grad = True
@@ -137,7 +175,7 @@ class RegionDiffusion(nn.Module):
                             loss_total += loss
                         loss_total.backward()
                     latents = (
-                        latents - latents.grad * text_format_dict['color_guidance_weight']).detach().clone()
         return latents
@@ -162,6 +200,7 @@ class RegionDiffusion(nn.Module):
                 (text_embeddings.shape[0] // 2, self.unet.in_channels, height // 8, width // 8), device=self.device)
         self.scheduler.set_timesteps(num_inference_steps)
         with torch.autocast('cuda'):
             for i, t in enumerate(self.scheduler.timesteps):
@@ -202,8 +241,18 @@ class RegionDiffusion(nn.Module):
         return imgs
     def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50,
-                      guidance_scale=7.5, latents=None, text_format_dict={}, use_grad_guidance=False):
         if isinstance(prompts, str):
             prompts = [prompts]
@@ -215,18 +264,11 @@ class RegionDiffusion(nn.Module):
         text_embeds = self.get_text_embeds(
             prompts, negative_prompts)  # [2, 77, 768]
-        if len(text_format_dict) > 0:
-            if 'font_styles' in text_format_dict and text_format_dict['font_styles'] is not None:
-                text_format_dict['font_styles_embs'] = self.get_text_embeds_list(
-                    text_format_dict['font_styles'])  # [2, 77, 768]
-            else:
-                text_format_dict['font_styles_embs'] = None
         # else:
         latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents,
                                        num_inference_steps=num_inference_steps, guidance_scale=guidance_scale,
-                                       use_grad_guidance=use_grad_guidance, text_format_dict=text_format_dict)  # [1, 4, 64, 64]
         # Img latents -> imgs
         imgs = self.decode_latents(latents)  # [1, 3, 512, 512]
@@ -272,7 +314,156 @@ class RegionDiffusion(nn.Module):
         # attention_dict is a dictionary containing attention maps for every attention layer
         self.attention_maps = attention_dict
     def remove_evaluation_hooks(self):
         r"""Detach every hook handle held in self.forward_hooks and reset self.attention_maps to None.
         """
         handles = self.forward_hooks
         for handle in handles:
             handle.remove()
         # the handle list itself is left in place; only the recorded maps are dropped
         self.attention_maps = None

 from transformers import CLIPTextModel, CLIPTokenizer, logging
 from diffusers import AutoencoderKL, PNDMScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
 from models.unet_2d_condition import UNet2DConditionModel
+from utils.attention_utils import CrossAttentionLayers, SelfAttentionLayers
 # suppress partial model loading warning
 logging.set_verbosity_error()
         self.masks = []
         self.attention_maps = None
         self.color_loss = torch.nn.functional.mse_loss
+        self.forward_replacement_hooks = []
         print(f'[INFO] loaded stable diffusion!')
         return text_embeddings
     def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5,
+                        latents=None, use_guidance=False, text_format_dict={}, inject_selfattn=0, bg_aug_end=1000):
         if latents is None:
             latents = torch.randn(
                 (1, self.unet.in_channels, height // 8, width // 8), device=self.device)
+        if inject_selfattn > 0:
+            latents_reference = latents.clone().detach()
         self.scheduler.set_timesteps(num_inference_steps)
         n_styles = text_embeddings.shape[0]-1
         assert n_styles == len(self.masks)
         with torch.autocast('cuda'):
             for i, t in enumerate(self.scheduler.timesteps):
                 # predict the noise residual
                 with torch.no_grad():
+                    # tokens without any attributes
+                    feat_inject_step = t > (1-inject_selfattn) * 1000
+                    noise_pred_uncond_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[:1],
+                                                      text_format_dict={})['sample']
+                    noise_pred_text_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[-1:],
+                                                    text_format_dict=text_format_dict)['sample']
+                    if inject_selfattn > 0:
+                        noise_pred_uncond_refer = self.unet(latents_reference, t, encoder_hidden_states=text_embeddings[:1],
                                                             text_format_dict={})['sample']
+                        self.register_selfattn_hooks(feat_inject_step)
+                        noise_pred_text_refer = self.unet(latents_reference, t, encoder_hidden_states=text_embeddings[-1:],
+                                                          text_format_dict={})['sample']
+                        self.remove_selfattn_hooks()
+                    noise_pred_uncond = noise_pred_uncond_cur * self.masks[-1]
+                    noise_pred_text = noise_pred_text_cur * self.masks[-1]
+                    # tokens with attributes
+                    for style_i, mask in enumerate(self.masks[:-1]):
+                        if t > bg_aug_end:
+                            rand_rgb = torch.rand([1, 3, 1, 1]).cuda()
+                            black_background = torch.ones(
+                                [1, 3, height, width]).cuda()*rand_rgb
+                            black_latent = self.encode_imgs(
+                                black_background)
+                            noise = torch.randn_like(black_latent)
+                            black_latent_noisy = self.scheduler.add_noise(
+                                black_latent, noise, t)
+                            masked_latent = (
+                                mask > 0.001) * latents + (mask < 0.001) * black_latent_noisy
+                            noise_pred_uncond_cur = self.unet(masked_latent, t, encoder_hidden_states=text_embeddings[:1],
+                                                              text_format_dict={})['sample']
                         else:
+                            masked_latent = latents
+                        self.register_replacement_hooks(feat_inject_step)
+                        noise_pred_text_cur = self.unet(masked_latent, t, encoder_hidden_states=text_embeddings[style_i+1:style_i+2],
+                                                        text_format_dict={})['sample']
+                        self.remove_replacement_hooks()
+                        noise_pred_uncond = noise_pred_uncond + noise_pred_uncond_cur*mask
+                        noise_pred_text = noise_pred_text + noise_pred_text_cur*mask
                 # perform classifier-free guidance
                 noise_pred = noise_pred_uncond + guidance_scale * \
                     (noise_pred_text - noise_pred_uncond)
+                if inject_selfattn > 0:
+                    noise_pred_refer = noise_pred_uncond_refer + guidance_scale * \
+                        (noise_pred_text_refer - noise_pred_uncond_refer)
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents_reference = self.scheduler.step(torch.cat([noise_pred, noise_pred_refer]), t,
+                                                            torch.cat([latents, latents_reference]))[
+                        'prev_sample']
+                    latents, latents_reference = torch.chunk(
+                        latents_reference, 2, dim=0)
+                else:
+                    # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(noise_pred, t, latents)[
+                        'prev_sample']
+                # apply guidance
+                if use_guidance and t < text_format_dict['guidance_start_step']:
                     with torch.enable_grad():
                         if not latents.requires_grad:
                             latents.requires_grad = True
                             loss_total += loss
                         loss_total.backward()
                     latents = (
+                        latents - latents.grad * text_format_dict['color_guidance_weight'] * self.masks[0]).detach().clone()
         return latents
                 (text_embeddings.shape[0] // 2, self.unet.in_channels, height // 8, width // 8), device=self.device)
         self.scheduler.set_timesteps(num_inference_steps)
+        self.remove_replacement_hooks()
         with torch.autocast('cuda'):
             for i, t in enumerate(self.scheduler.timesteps):
         return imgs
+    def encode_imgs(self, imgs):
+        r"""Encode images into scaled VAE latents.
+        The input is rescaled as 2 * imgs - 1, passed through self.vae.encode,
+        and one sample drawn from the returned latent_dist is multiplied by 0.18215.
+        """
+        # imgs: [B, 3, H, W]
+        # NOTE(review): the 2*x-1 rescale suggests inputs in [0, 1] mapped to [-1, 1] -- confirm with callers.
+        imgs = 2 * imgs - 1
+        posterior = self.vae.encode(imgs).latent_dist
+        # NOTE(review): 0.18215 is presumably the Stable Diffusion latent scaling factor -- confirm.
+        latents = posterior.sample() * 0.18215
+        return latents
     def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50,
+                      guidance_scale=7.5, latents=None, text_format_dict={}, use_guidance=False, inject_selfattn=0, bg_aug_end=1000):
         if isinstance(prompts, str):
             prompts = [prompts]
         text_embeds = self.get_text_embeds(
             prompts, negative_prompts)  # [2, 77, 768]
         # else:
         latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents,
                                        num_inference_steps=num_inference_steps, guidance_scale=guidance_scale,
+                                       use_guidance=use_guidance, text_format_dict=text_format_dict,
+                                       inject_selfattn=inject_selfattn, bg_aug_end=bg_aug_end)  # [1, 4, 64, 64]
         # Img latents -> imgs
         imgs = self.decode_latents(latents)  # [1, 3, 512, 512]
         # attention_dict is a dictionary containing attention maps for every attention layer
         self.attention_maps = attention_dict
+    def register_selfattn_hooks(self, feat_inject_step=False):
+        r"""Function for registering hooks that record self-attention maps and one residual feature.
+        When feat_inject_step is truthy, forward hooks are attached to every UNet module whose
+        leaf name contains 'attn' and to the module named 'up_blocks.1.resnets.1'.
+        The recorded tensors are exposed through self.self_attention_maps_cur, keyed by module name.
+        When feat_inject_step is falsy no hook is attached and that dictionary stays empty.
+        Any handles previously held in self.selfattn_forward_hooks are dropped without being removed.
+        """
+        self.selfattn_forward_hooks = []
+        def save_activations(activations, name, module, inp, out):
+            r"""
+            PyTorch forward hook that stores out[1][1] for layers whose name does not contain 'attn2'.
+            """
+            # out[0] - final output of attention layer
+            # out[1] - attention probabilities; out[1][1] is the entry recorded here
+            # NOTE(review): out[1][1] is presumably the un-averaged per-head map -- confirm against CrossAttention.forward.
+            if 'attn2' in name:
+                # cross-attention layers (last dim 77) are only sanity-checked, nothing is stored
+                assert out[1][1].shape[-1] == 77
+                # cross attention injection (disabled)
+                # activations[name] = out[1][1].detach()
+            else:
+                assert out[1][1].shape[-1] != 77
+                activations[name] = out[1][1].detach()
+        def save_resnet_activations(activations, name, module, inp, out):
+            r"""
+            PyTorch forward hook that stores out[1] of the hooked residual block.
+            """
+            # out[0] - final output of residual layer
+            # out[1] - residual hidden feature
+            # the hooked block is expected to produce a feature whose last dimension is 16
+            assert out[1].shape[-1] == 16
+            activations[name] = out[1].detach()
+        attention_dict = collections.defaultdict(list)
+        for name, module in self.unet.named_modules():
+            leaf_name = name.split('.')[-1]
+            if 'attn' in leaf_name and feat_inject_step:
+                # Register hook to obtain outputs at every attention layer.
+                self.selfattn_forward_hooks.append(module.register_forward_hook(
+                    partial(save_activations, attention_dict, name)
+                ))
+            if name == 'up_blocks.1.resnets.1' and feat_inject_step:
+                # Register hook to obtain the residual feature of this single resnet block.
+                self.selfattn_forward_hooks.append(module.register_forward_hook(
+                    partial(save_resnet_activations, attention_dict, name)
+                ))
+        # attention_dict maps module names to the tensors recorded by the hooks above
+        self.self_attention_maps_cur = attention_dict
+    def register_replacement_hooks(self, feat_inject_step=False):
+        r"""Function for registering hooks to replace self attention.
+        When feat_inject_step is truthy, forward pre-hooks are attached to every UNet module whose
+        leaf name contains 'attn' and to the module named 'up_blocks.1.resnets.1'. The pre-hooks
+        rewrite the positional inputs using the tensors stored in self.self_attention_maps_cur.
+        When feat_inject_step is falsy no hook is attached.
+        """
+        # NOTE(review): handles from an earlier call are dropped here without being removed;
+        # the visible caller pairs every call with remove_replacement_hooks() -- confirm for other callers.
+        self.forward_replacement_hooks = []
+        def replace_activations(name, module, args):
+            r"""
+            PyTorch forward pre-hook that passes the stored attention map as the second positional input.
+            """
+            if 'attn1' in name:
+                # second positional argument of the attention forward is real_attn_probs
+                modified_args = (args[0], self.self_attention_maps_cur[name])
+                return modified_args
+            # other attention layers fall through and return None, leaving their inputs unchanged
+                # cross attention injection (disabled)
+            # elif 'attn2' in name:
+            #     modified_map = {
+            #         'reference': self.self_attention_maps_cur[name],
+            #         'inject_pos': self.inject_pos,
+            #     }
+            #     modified_args = (args[0], modified_map)
+            #     return modified_args
+        def replace_resnet_activations(name, module, args):
+            r"""
+            PyTorch forward pre-hook that appends the stored residual feature as a third positional input.
+            """
+            modified_args = (args[0], args[1],
+                             self.self_attention_maps_cur[name])
+            return modified_args
+        for name, module in self.unet.named_modules():
+            leaf_name = name.split('.')[-1]
+            if 'attn' in leaf_name and feat_inject_step:
+                # Register pre-hook to rewrite the inputs of every attention layer.
+                self.forward_replacement_hooks.append(module.register_forward_pre_hook(
+                    partial(replace_activations, name)
+                ))
+            if name == 'up_blocks.1.resnets.1' and feat_inject_step:
+                # Register pre-hook to rewrite the inputs of this single resnet block.
+                self.forward_replacement_hooks.append(module.register_forward_pre_hook(
+                    partial(replace_resnet_activations, name)
+                ))
+    def register_tokenmap_hooks(self):
+        r"""Function for registering hooks during evaluation.
+        We mainly store activation maps averaged over queries.
+        """
+        self.forward_hooks = []
+        def save_activations(selfattn_maps, crossattn_maps, n_maps, name, module, inp, out):
+            r"""
+            PyTorch Forward hook to save outputs at each forward pass.
+            """
+            # out[0] - final output of attention layer
+            # out[1] - attention probability matrices
+            if name in n_maps:
+                n_maps[name] += 1
+            else:
+                n_maps[name] = 1
+            if 'attn2' in name:
+                assert out[1][0].shape[-1] == 77
+                if name in CrossAttentionLayers and n_maps[name] > 10:
+                    if name in crossattn_maps:
+                        crossattn_maps[name] += out[1][0].detach().cpu()[1:2]
+                    else:
+                        crossattn_maps[name] = out[1][0].detach().cpu()[1:2]
+            else:
+                assert out[1][0].shape[-1] != 77
+                if name in SelfAttentionLayers and n_maps[name] > 10:
+                    if name in crossattn_maps:
+                        selfattn_maps[name] += out[1][0].detach().cpu()[1:2]
+                    else:
+                        selfattn_maps[name] = out[1][0].detach().cpu()[1:2]
+        selfattn_maps = collections.defaultdict(list)
+        crossattn_maps = collections.defaultdict(list)
+        n_maps = collections.defaultdict(list)
+        for name, module in self.unet.named_modules():
+            leaf_name = name.split('.')[-1]
+            if 'attn' in leaf_name:
+                # Register hook to obtain outputs at every attention layer.
+                self.forward_hooks.append(module.register_forward_hook(
+                    partial(save_activations, selfattn_maps,
+                            crossattn_maps, n_maps, name)
+                ))
+        # attention_dict is a dictionary containing attention maps for every attention layer
+        self.selfattn_maps = selfattn_maps
+        self.crossattn_maps = crossattn_maps
+        self.n_maps = n_maps
+    def remove_tokenmap_hooks(self):
+        r"""Remove the hooks held in self.forward_hooks and reset the token-map buffers.
+        self.selfattn_maps, self.crossattn_maps and self.n_maps are set to None.
+        """
+        # self.forward_hooks is the same list that remove_evaluation_hooks iterates
+        for hook in self.forward_hooks:
+            hook.remove()
+        self.selfattn_maps = None
+        self.crossattn_maps = None
+        self.n_maps = None
     def remove_evaluation_hooks(self):
         r"""Detach every hook handle held in self.forward_hooks and reset self.attention_maps to None.
         """
         handles = self.forward_hooks
         for handle in handles:
             handle.remove()
         # the handle list itself is left in place; only the recorded maps are dropped
         self.attention_maps = None
+    def remove_replacement_hooks(self):
+        r"""Remove every forward pre-hook handle held in self.forward_replacement_hooks.
+        The list itself is not cleared here.
+        """
+        for hook in self.forward_replacement_hooks:
+            hook.remove()
+    def remove_selfattn_hooks(self):
+        r"""Remove every forward hook handle held in self.selfattn_forward_hooks.
+        The list itself is not cleared here.
+        """
+        # NOTE(review): in the visible code self.selfattn_forward_hooks is only assigned by
+        # register_selfattn_hooks, so calling this first would presumably raise AttributeError -- confirm.
+        for hook in self.selfattn_forward_hooks:
+            hook.remove()

models/unet_2d_blocks.py CHANGED Viewed

@@ -16,7 +16,7 @@ import torch
 from torch import nn
 from .attention import AttentionBlock, DualTransformer2DModel, Transformer2DModel
-from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, ResnetBlock2D, Upsample2D
 def get_down_block(
@@ -36,7 +36,8 @@ def get_down_block(
     use_linear_projection=False,
     only_cross_attention=False,
 ):
-    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
     if down_block_type == "DownBlock2D":
         return DownBlock2D(
             num_layers=num_layers,
@@ -64,7 +65,8 @@ def get_down_block(
         )
     elif down_block_type == "CrossAttnDownBlock2D":
         if cross_attention_dim is None:
-            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
         return CrossAttnDownBlock2D(
             num_layers=num_layers,
             in_channels=in_channels,
@@ -147,7 +149,8 @@ def get_up_block(
     use_linear_projection=False,
     only_cross_attention=False,
 ):
-    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
     if up_block_type == "UpBlock2D":
         return UpBlock2D(
             num_layers=num_layers,
@@ -162,7 +165,8 @@ def get_up_block(
         )
     elif up_block_type == "CrossAttnUpBlock2D":
         if cross_attention_dim is None:
-            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
         return CrossAttnUpBlock2D(
             num_layers=num_layers,
             in_channels=in_channels,
@@ -258,7 +262,8 @@ class UNetMidBlock2D(nn.Module):
         super().__init__()
         self.attention_type = attention_type
-        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
         # there is always at least one resnet
         resnets = [
@@ -312,7 +317,7 @@ class UNetMidBlock2D(nn.Module):
                 hidden_states = attn(hidden_states)
             else:
                 hidden_states = attn(hidden_states, encoder_states)
-            hidden_states = resnet(hidden_states, temb)
         return hidden_states
@@ -340,7 +345,8 @@ class UNetMidBlock2DCrossAttn(nn.Module):
         self.attention_type = attention_type
         self.attn_num_head_channels = attn_num_head_channels
-        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
         # there is always at least one resnet
         resnets = [
@@ -420,15 +426,16 @@ class UNetMidBlock2DCrossAttn(nn.Module):
     def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
         for attn in self.attentions:
-            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
     def forward(self, hidden_states, temb=None, encoder_hidden_states=None,
                 text_format_dict={}):
-        hidden_states = self.resnets[0](hidden_states, temb)
         for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            hidden_states = attn(hidden_states, encoder_hidden_states,
                                  text_format_dict).sample
-            hidden_states = resnet(hidden_states, temb)
         return hidden_states
@@ -502,7 +509,7 @@ class AttnDownBlock2D(nn.Module):
         output_states = ()
         for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb)
             hidden_states = attn(hidden_states)
             output_states += (hidden_states,)
@@ -620,7 +627,8 @@ class CrossAttnDownBlock2D(nn.Module):
     def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
         for attn in self.attentions:
-            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
     def forward(self, hidden_states, temb=None, encoder_hidden_states=None,
                 text_format_dict={}):
@@ -638,13 +646,15 @@ class CrossAttnDownBlock2D(nn.Module):
                     return custom_forward
-                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
                 hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states,
-                                          text_format_dict
                 )[0]
             else:
-                hidden_states = resnet(hidden_states, temb)
                 hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
                                      text_format_dict=text_format_dict).sample
@@ -723,9 +733,10 @@ class DownBlock2D(nn.Module):
                     return custom_forward
-                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
             else:
-                hidden_states = resnet(hidden_states, temb)
             output_states += (hidden_states,)
@@ -789,7 +800,7 @@ class DownEncoderBlock2D(nn.Module):
     def forward(self, hidden_states):
         for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb=None)
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
@@ -861,7 +872,7 @@ class AttnDownEncoderBlock2D(nn.Module):
     def forward(self, hidden_states):
         for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb=None)
             hidden_states = attn(hidden_states)
         if self.downsamplers is not None:
@@ -937,8 +948,10 @@ class AttnSkipDownBlock2D(nn.Module):
                 down=True,
                 kernel="fir",
             )
-            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
-            self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
         else:
             self.resnet_down = None
             self.downsamplers = None
@@ -948,7 +961,7 @@ class AttnSkipDownBlock2D(nn.Module):
         output_states = ()
         for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb)
             hidden_states = attn(hidden_states)
             output_states += (hidden_states,)
@@ -1017,8 +1030,10 @@ class SkipDownBlock2D(nn.Module):
                 down=True,
                 kernel="fir",
             )
-            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
-            self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
         else:
             self.resnet_down = None
             self.downsamplers = None
@@ -1028,7 +1043,7 @@ class SkipDownBlock2D(nn.Module):
         output_states = ()
         for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb)
             output_states += (hidden_states,)
         if self.downsamplers is not None:
@@ -1069,7 +1084,8 @@ class AttnUpBlock2D(nn.Module):
         self.attention_type = attention_type
         for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             resnets.append(
@@ -1100,7 +1116,8 @@ class AttnUpBlock2D(nn.Module):
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
-            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
@@ -1109,9 +1126,10 @@ class AttnUpBlock2D(nn.Module):
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
-            hidden_states = resnet(hidden_states, temb)
             hidden_states = attn(hidden_states)
         if self.upsamplers is not None:
@@ -1152,7 +1170,8 @@ class CrossAttnUpBlock2D(nn.Module):
         self.attn_num_head_channels = attn_num_head_channels
         for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             resnets.append(
@@ -1197,7 +1216,8 @@ class CrossAttnUpBlock2D(nn.Module):
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
-            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
@@ -1224,7 +1244,8 @@ class CrossAttnUpBlock2D(nn.Module):
     def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
         for attn in self.attentions:
-            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
     def forward(
         self,
@@ -1239,7 +1260,8 @@ class CrossAttnUpBlock2D(nn.Module):
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
             if self.training and self.gradient_checkpointing:
@@ -1252,13 +1274,15 @@ class CrossAttnUpBlock2D(nn.Module):
                     return custom_forward
-                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
                 hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states,
-                                          text_format_dict
                 )[0]
             else:
-                hidden_states = resnet(hidden_states, temb)
                 hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
                                      text_format_dict=text_format_dict).sample
@@ -1290,7 +1314,8 @@ class UpBlock2D(nn.Module):
         resnets = []
         for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             resnets.append(
@@ -1311,7 +1336,8 @@ class UpBlock2D(nn.Module):
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
-            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
@@ -1322,7 +1348,8 @@ class UpBlock2D(nn.Module):
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
             if self.training and self.gradient_checkpointing:
@@ -1332,9 +1359,10 @@ class UpBlock2D(nn.Module):
                     return custom_forward
-                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
             else:
-                hidden_states = resnet(hidden_states, temb)
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
@@ -1382,13 +1410,14 @@ class UpDecoderBlock2D(nn.Module):
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
-            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
     def forward(self, hidden_states):
         for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb=None)
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
@@ -1448,13 +1477,14 @@ class AttnUpDecoderBlock2D(nn.Module):
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
-            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
     def forward(self, hidden_states):
         for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb=None)
             hidden_states = attn(hidden_states)
         if self.upsamplers is not None:
@@ -1490,7 +1520,8 @@ class AttnSkipUpBlock2D(nn.Module):
         self.attention_type = attention_type
         for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             self.resnets.append(
@@ -1499,7 +1530,8 @@ class AttnSkipUpBlock2D(nn.Module):
                     out_channels=out_channels,
                     temb_channels=temb_channels,
                     eps=resnet_eps,
-                    groups=min(resnet_in_channels + res_skip_channels // 4, 32),
                     groups_out=min(out_channels // 4, 32),
                     dropout=dropout,
                     time_embedding_norm=resnet_time_scale_shift,
@@ -1536,7 +1568,8 @@ class AttnSkipUpBlock2D(nn.Module):
                 up=True,
                 kernel="fir",
             )
-            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
             self.skip_norm = torch.nn.GroupNorm(
                 num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
             )
@@ -1552,9 +1585,10 @@ class AttnSkipUpBlock2D(nn.Module):
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
-            hidden_states = resnet(hidden_states, temb)
         hidden_states = self.attentions[0](hidden_states)
@@ -1596,7 +1630,8 @@ class SkipUpBlock2D(nn.Module):
         self.resnets = nn.ModuleList([])
         for i in range(num_layers):
-            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             self.resnets.append(
@@ -1605,7 +1640,8 @@ class SkipUpBlock2D(nn.Module):
                     out_channels=out_channels,
                     temb_channels=temb_channels,
                     eps=resnet_eps,
-                    groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
                     groups_out=min(out_channels // 4, 32),
                     dropout=dropout,
                     time_embedding_norm=resnet_time_scale_shift,
@@ -1633,7 +1669,8 @@ class SkipUpBlock2D(nn.Module):
                 up=True,
                 kernel="fir",
             )
-            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
             self.skip_norm = torch.nn.GroupNorm(
                 num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
             )
@@ -1649,9 +1686,10 @@ class SkipUpBlock2D(nn.Module):
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
-            hidden_states = resnet(hidden_states, temb)
         if skip_sample is not None:
             skip_sample = self.upsampler(skip_sample)
@@ -1668,3 +1706,150 @@ class SkipUpBlock2D(nn.Module):
             hidden_states = self.resnet_up(hidden_states, temb)
         return hidden_states, skip_sample

 from torch import nn
 from .attention import AttentionBlock, DualTransformer2DModel, Transformer2DModel
+from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, Upsample2D
 def get_down_block(
     use_linear_projection=False,
     only_cross_attention=False,
 ):
+    down_block_type = down_block_type[7:] if down_block_type.startswith(
+        "UNetRes") else down_block_type
     if down_block_type == "DownBlock2D":
         return DownBlock2D(
             num_layers=num_layers,
         )
     elif down_block_type == "CrossAttnDownBlock2D":
         if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnDownBlock2D")
         return CrossAttnDownBlock2D(
             num_layers=num_layers,
             in_channels=in_channels,
     use_linear_projection=False,
     only_cross_attention=False,
 ):
+    up_block_type = up_block_type[7:] if up_block_type.startswith(
+        "UNetRes") else up_block_type
     if up_block_type == "UpBlock2D":
         return UpBlock2D(
             num_layers=num_layers,
         )
     elif up_block_type == "CrossAttnUpBlock2D":
         if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnUpBlock2D")
         return CrossAttnUpBlock2D(
             num_layers=num_layers,
             in_channels=in_channels,
         super().__init__()
         self.attention_type = attention_type
+        resnet_groups = resnet_groups if resnet_groups is not None else min(
+            in_channels // 4, 32)
         # there is always at least one resnet
         resnets = [
                 hidden_states = attn(hidden_states)
             else:
                 hidden_states = attn(hidden_states, encoder_states)
+            hidden_states, _ = resnet(hidden_states, temb)
         return hidden_states
         self.attention_type = attention_type
         self.attn_num_head_channels = attn_num_head_channels
+        resnet_groups = resnet_groups if resnet_groups is not None else min(
+            in_channels // 4, 32)
         # there is always at least one resnet
         resnets = [
     def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
         for attn in self.attentions:
+            attn._set_use_memory_efficient_attention_xformers(
+                use_memory_efficient_attention_xformers)
     def forward(self, hidden_states, temb=None, encoder_hidden_states=None,
                 text_format_dict={}):
+        hidden_states, _ = self.resnets[0](hidden_states, temb)
         for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states, encoder_hidden_states,
                                  text_format_dict).sample
+            hidden_states, _ = resnet(hidden_states, temb)
         return hidden_states
         output_states = ()
         for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states, _ = resnet(hidden_states, temb)
             hidden_states = attn(hidden_states)
             output_states += (hidden_states,)
     def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
         for attn in self.attentions:
+            attn._set_use_memory_efficient_attention_xformers(
+                use_memory_efficient_attention_xformers)
     def forward(self, hidden_states, temb=None, encoder_hidden_states=None,
                 text_format_dict={}):
                     return custom_forward
                 hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(
+                        attn, return_dict=False), hidden_states, encoder_hidden_states,
+                    text_format_dict
                 )[0]
             else:
+                hidden_states, _ = resnet(hidden_states, temb)
                 hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
                                      text_format_dict=text_format_dict).sample
                     return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb)
             else:
+                hidden_states, _ = resnet(hidden_states, temb)
             output_states += (hidden_states,)
     def forward(self, hidden_states):
         for resnet in self.resnets:
+            hidden_states, _ = resnet(hidden_states, temb=None)
         if self.downsamplers is not None:
             for downsampler in self.downsamplers:
     def forward(self, hidden_states):
         for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states, _ = resnet(hidden_states, temb=None)
             hidden_states = attn(hidden_states)
         if self.downsamplers is not None:
                 down=True,
                 kernel="fir",
             )
+            self.downsamplers = nn.ModuleList(
+                [FirDownsample2D(out_channels, out_channels=out_channels)])
+            self.skip_conv = nn.Conv2d(
+                3, out_channels, kernel_size=(1, 1), stride=(1, 1))
         else:
             self.resnet_down = None
             self.downsamplers = None
         output_states = ()
         for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states, _ = resnet(hidden_states, temb)
             hidden_states = attn(hidden_states)
             output_states += (hidden_states,)
                 down=True,
                 kernel="fir",
             )
+            self.downsamplers = nn.ModuleList(
+                [FirDownsample2D(out_channels, out_channels=out_channels)])
+            self.skip_conv = nn.Conv2d(
+                3, out_channels, kernel_size=(1, 1), stride=(1, 1))
         else:
             self.resnet_down = None
             self.downsamplers = None
         output_states = ()
         for resnet in self.resnets:
+            hidden_states, _ = resnet(hidden_states, temb)
             output_states += (hidden_states,)
         if self.downsamplers is not None:
         self.attention_type = attention_type
         for i in range(num_layers):
+            res_skip_channels = in_channels if (
+                i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             resnets.append(
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat(
+                [hidden_states, res_hidden_states], dim=1)
+            hidden_states, _ = resnet(hidden_states, temb)
             hidden_states = attn(hidden_states)
         if self.upsamplers is not None:
         self.attn_num_head_channels = attn_num_head_channels
         for i in range(num_layers):
+            res_skip_channels = in_channels if (
+                i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             resnets.append(
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
     def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
         for attn in self.attentions:
+            attn._set_use_memory_efficient_attention_xformers(
+                use_memory_efficient_attention_xformers)
     def forward(
         self,
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat(
+                [hidden_states, res_hidden_states], dim=1)
             if self.training and self.gradient_checkpointing:
                     return custom_forward
                 hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(
+                        attn, return_dict=False), hidden_states, encoder_hidden_states,
+                    text_format_dict
                 )[0]
             else:
+                hidden_states, _ = resnet(hidden_states, temb)
                 hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
                                      text_format_dict=text_format_dict).sample
         resnets = []
         for i in range(num_layers):
+            res_skip_channels = in_channels if (
+                i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             resnets.append(
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat(
+                [hidden_states, res_hidden_states], dim=1)
             if self.training and self.gradient_checkpointing:
                     return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb)
             else:
+                hidden_states, _ = resnet(hidden_states, temb)
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
     def forward(self, hidden_states):
         for resnet in self.resnets:
+            hidden_states, _ = resnet(hidden_states, temb=None)
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
         self.resnets = nn.ModuleList(resnets)
         if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
         else:
             self.upsamplers = None
     def forward(self, hidden_states):
         for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states, _ = resnet(hidden_states, temb=None)
             hidden_states = attn(hidden_states)
         if self.upsamplers is not None:
         self.attention_type = attention_type
         for i in range(num_layers):
+            res_skip_channels = in_channels if (
+                i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             self.resnets.append(
                     out_channels=out_channels,
                     temb_channels=temb_channels,
                     eps=resnet_eps,
+                    groups=min(resnet_in_channels +
+                               res_skip_channels // 4, 32),
                     groups_out=min(out_channels // 4, 32),
                     dropout=dropout,
                     time_embedding_norm=resnet_time_scale_shift,
                 up=True,
                 kernel="fir",
             )
+            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(
+                3, 3), stride=(1, 1), padding=(1, 1))
             self.skip_norm = torch.nn.GroupNorm(
                 num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
             )
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat(
+                [hidden_states, res_hidden_states], dim=1)
+            hidden_states, _ = resnet(hidden_states, temb)
         hidden_states = self.attentions[0](hidden_states)
         self.resnets = nn.ModuleList([])
         for i in range(num_layers):
+            res_skip_channels = in_channels if (
+                i == num_layers - 1) else out_channels
             resnet_in_channels = prev_output_channel if i == 0 else out_channels
             self.resnets.append(
                     out_channels=out_channels,
                     temb_channels=temb_channels,
                     eps=resnet_eps,
+                    groups=min(
+                        (resnet_in_channels + res_skip_channels) // 4, 32),
                     groups_out=min(out_channels // 4, 32),
                     dropout=dropout,
                     time_embedding_norm=resnet_time_scale_shift,
                 up=True,
                 kernel="fir",
             )
+            self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(
+                3, 3), stride=(1, 1), padding=(1, 1))
             self.skip_norm = torch.nn.GroupNorm(
                 num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
             )
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat(
+                [hidden_states, res_hidden_states], dim=1)
+            hidden_states, _ = resnet(hidden_states, temb)
         if skip_sample is not None:
             skip_sample = self.upsampler(skip_sample)
             hidden_states = self.resnet_up(hidden_states, temb)
         return hidden_states, skip_sample
+class ResnetBlock2D(nn.Module):
+    """Residual block (norm -> activation -> conv, applied twice) that exposes its residual branch.
+
+    ``forward`` returns a pair ``(output, hidden_states)``: the skip-plus-residual
+    output and the raw residual-branch activations. It also accepts
+    ``inject_states`` so a caller can substitute its own residual branch when
+    the output is formed.
+
+    Args:
+        in_channels: Channel count of the input tensor.
+        out_channels: Channel count of the output; defaults to ``in_channels``.
+        conv_shortcut: Stored as ``self.use_conv_shortcut``; not read elsewhere in this class.
+        dropout: Dropout probability applied before the second convolution.
+        temb_channels: Width of the time embedding fed to ``time_emb_proj``;
+            ``None`` disables the projection layer.
+        groups: Number of groups for the first GroupNorm.
+        groups_out: Number of groups for the second GroupNorm; defaults to ``groups``.
+        pre_norm: Accepted for signature compatibility only; ``self.pre_norm`` is always ``True``.
+        eps: Epsilon used by both GroupNorm layers.
+        non_linearity: One of ``"swish"``, ``"mish"`` or ``"silu"``.
+        time_embedding_norm: ``"default"`` (time embedding is added before the second norm)
+            or ``"scale_shift"`` (it is split into a scale and a shift applied after it).
+        kernel: Resampling variant used when ``up`` or ``down`` is set: ``"fir"``,
+            ``"sde_vp"``, or any other value for ``Upsample2D`` / ``Downsample2D``.
+        output_scale_factor: Divisor applied to the summed output.
+        use_in_shortcut: Whether to build the 1x1 shortcut convolution; ``None``
+            means "only when ``in_channels != out_channels``".
+        up: Upsample inside the block (takes precedence over ``down``).
+        down: Downsample inside the block.
+
+    Raises:
+        ValueError: If ``time_embedding_norm`` or ``non_linearity`` is not a supported value.
+    """
+
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout=0.0,
+        temb_channels=512,
+        groups=32,
+        groups_out=None,
+        pre_norm=True,
+        eps=1e-6,
+        non_linearity="swish",
+        time_embedding_norm="default",
+        kernel=None,
+        output_scale_factor=1.0,
+        use_in_shortcut=None,
+        up=False,
+        down=False,
+    ):
+        super().__init__()
+        # The block is always pre-norm; the ``pre_norm`` argument is kept only so
+        # existing call sites that pass it keep working.
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.time_embedding_norm = time_embedding_norm
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+        if groups_out is None:
+            groups_out = groups
+        self.norm1 = torch.nn.GroupNorm(
+            num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = torch.nn.Conv2d(
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if temb_channels is not None:
+            if self.time_embedding_norm == "default":
+                time_emb_proj_out_channels = out_channels
+            elif self.time_embedding_norm == "scale_shift":
+                # Twice the width: one half becomes the scale, the other the shift.
+                time_emb_proj_out_channels = out_channels * 2
+            else:
+                raise ValueError(
+                    f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+            self.time_emb_proj = torch.nn.Linear(
+                temb_channels, time_emb_proj_out_channels)
+        else:
+            self.time_emb_proj = None
+        self.norm2 = torch.nn.GroupNorm(
+            num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if non_linearity == "swish":
+            self.nonlinearity = F.silu
+        elif non_linearity == "mish":
+            self.nonlinearity = Mish()
+        elif non_linearity == "silu":
+            self.nonlinearity = nn.SiLU()
+        else:
+            # Fail at construction instead of leaving ``self.nonlinearity`` unset
+            # and crashing later inside ``forward``.
+            raise ValueError(f"unknown non_linearity : {non_linearity}")
+        self.upsample = self.downsample = None
+        if self.up:
+            if kernel == "fir":
+                self.upsample = partial(upsample_2d, kernel=(1, 3, 3, 1))
+            elif kernel == "sde_vp":
+                self.upsample = partial(
+                    F.interpolate, scale_factor=2.0, mode="nearest")
+            else:
+                self.upsample = Upsample2D(in_channels, use_conv=False)
+        elif self.down:
+            if kernel == "fir":
+                self.downsample = partial(downsample_2d, kernel=(1, 3, 3, 1))
+            elif kernel == "sde_vp":
+                self.downsample = partial(
+                    F.avg_pool2d, kernel_size=2, stride=2)
+            else:
+                self.downsample = Downsample2D(
+                    in_channels, use_conv=False, padding=1, name="op")
+        if use_in_shortcut is None:
+            # Default: a 1x1 projection is only needed when the channel count changes.
+            use_in_shortcut = self.in_channels != self.out_channels
+        self.use_in_shortcut = use_in_shortcut
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = torch.nn.Conv2d(
+                in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, input_tensor, temb, inject_states=None):
+        """Run the block and return ``(output_tensor, hidden_states)``.
+
+        Args:
+            input_tensor: Feature map fed to the 2D norm/conv layers
+                (presumably ``(batch, in_channels, height, width)`` - confirm against callers).
+            temb: Time embedding, or ``None`` to skip time conditioning. It is passed through
+                the activation and ``time_emb_proj`` and then broadcast over the two spatial axes.
+            inject_states: Optional tensor used instead of this block's own residual
+                branch when forming ``output_tensor``.
+
+        Returns:
+            A tuple ``(output_tensor, hidden_states)``. ``output_tensor`` is
+            ``(shortcut + residual) / output_scale_factor``, where ``residual`` is
+            ``inject_states`` when given and ``hidden_states`` otherwise.
+            ``hidden_states`` is always the residual branch computed by this block.
+        """
+        hidden_states = input_tensor
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        # Resample the skip path and the residual path identically so they can be summed.
+        if self.upsample is not None:
+            # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+            if hidden_states.shape[0] >= 64:
+                input_tensor = input_tensor.contiguous()
+                hidden_states = hidden_states.contiguous()
+            input_tensor = self.upsample(input_tensor)
+            hidden_states = self.upsample(hidden_states)
+        elif self.downsample is not None:
+            input_tensor = self.downsample(input_tensor)
+            hidden_states = self.downsample(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+        if temb is not None:
+            # Append two singleton axes so the projection broadcasts over height and width.
+            temb = self.time_emb_proj(self.nonlinearity(temb))[
+                :, :, None, None]
+        if temb is not None and self.time_embedding_norm == "default":
+            hidden_states = hidden_states + temb
+        hidden_states = self.norm2(hidden_states)
+        if temb is not None and self.time_embedding_norm == "scale_shift":
+            scale, shift = torch.chunk(temb, 2, dim=1)
+            hidden_states = hidden_states * (1 + scale) + shift
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+        # A caller-supplied residual replaces the computed one in the sum only;
+        # the computed branch is still returned as the second element.
+        residual = hidden_states if inject_states is None else inject_states
+        output_tensor = (input_tensor + residual) / self.output_scale_factor
+        return output_tensor, hidden_states

utils/attention_utils.py CHANGED Viewed

@@ -6,7 +6,46 @@ import seaborn as sns
 import torch
 import torchvision
-from pathlib import Path
 def split_attention_maps_over_steps(attention_maps):
@@ -37,7 +76,7 @@ def split_attention_maps_over_steps(attention_maps):
 def plot_attention_maps(atten_map_list, obj_tokens, save_dir, seed, tokens_vis=None):
     atten_names = ['presoftmax', 'postsoftmax', 'postsoftmax_erosion']
-    for i, (attn_map, obj_token) in enumerate(zip(atten_map_list, obj_tokens)):
         n_obj = len(attn_map)
         plt.figure()
         plt.clf()
@@ -63,6 +102,7 @@ def plot_attention_maps(atten_map_list, obj_tokens, save_dir, seed, tokens_vis=N
                 cmap=cmap, vmin=vmin, vmax=vmax
             )
             axs[tid].set_axis_off()
             if tokens_vis is not None:
                 if tid == n_obj-1:
                     axs_xlabel = 'other tokens'
@@ -79,13 +119,14 @@ def plot_attention_maps(atten_map_list, obj_tokens, save_dir, seed, tokens_vis=N
         canvas = fig.canvas
         canvas.draw()
         width, height = canvas.get_width_height()
-        img = np.frombuffer(canvas.tostring_rgb(), dtype='uint8').reshape((height, width, 3))
         fig.tight_layout()
     return img
-def get_token_maps(attention_maps, save_dir, width, height, obj_tokens, seed=0, tokens_vis=None):
     r"""Function to visualize attention maps.
     Args:
         save_dir (str): Path to save attention maps
@@ -98,25 +139,6 @@ def get_token_maps(attention_maps, save_dir, width, height, obj_tokens, seed=0,
         attention_maps
     )
-    selected_layers = [
-        # 'down_blocks.0.attentions.0.transformer_blocks.0.attn2',
-        # 'down_blocks.0.attentions.1.transformer_blocks.0.attn2',
-        'down_blocks.1.attentions.0.transformer_blocks.0.attn2',
-        # 'down_blocks.1.attentions.1.transformer_blocks.0.attn2',
-        'down_blocks.2.attentions.0.transformer_blocks.0.attn2',
-        'down_blocks.2.attentions.1.transformer_blocks.0.attn2',
-        'mid_block.attentions.0.transformer_blocks.0.attn2',
-        'up_blocks.1.attentions.0.transformer_blocks.0.attn2',
-        'up_blocks.1.attentions.1.transformer_blocks.0.attn2',
-        'up_blocks.1.attentions.2.transformer_blocks.0.attn2',
-        # 'up_blocks.2.attentions.0.transformer_blocks.0.attn2',
-        'up_blocks.2.attentions.1.transformer_blocks.0.attn2',
-        # 'up_blocks.2.attentions.2.transformer_blocks.0.attn2',
-        # 'up_blocks.3.attentions.0.transformer_blocks.0.attn2',
-        # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2',
-        # 'up_blocks.3.attentions.2.transformer_blocks.0.attn2'
-    ]
     nsteps = len(attention_maps_cond)
     hw_ori = width * height
@@ -128,7 +150,7 @@ def get_token_maps(attention_maps, save_dir, width, height, obj_tokens, seed=0,
         attention_maps_cur = attention_maps_cond[step_num]
         for layer in attention_maps_cur.keys():
-            if step_num < 10 or layer not in selected_layers:
                 continue
             attention_ind = attention_maps_cur[layer].cpu()
@@ -179,7 +201,107 @@ def get_token_maps(attention_maps, save_dir, width, height, obj_tokens, seed=0,
         attention_maps_averaged_normalized[i:i+1] for i in range(attention_maps_averaged_normalized.shape[0])]
     token_maps_vis = plot_attention_maps([attention_maps_averaged, attention_maps_averaged_normalized],
-                        obj_tokens, save_dir, seed, tokens_vis)
     attention_maps_averaged_normalized = [attn_mask.unsqueeze(1).repeat(
         [1, 4, 1, 1]).cuda() for attn_mask in attention_maps_averaged_normalized]
     return attention_maps_averaged_normalized, token_maps_vis

 import torch
 import torchvision
+from sklearn.cluster import KMeans
+SelfAttentionLayers = [  # Enabled attn1 layer names; commented-out entries are disabled. Presumably attn1 is each transformer block's self-attention (per the variable name) - confirm against the UNet.
+    # 'down_blocks.0.attentions.0.transformer_blocks.0.attn1',
+    # 'down_blocks.0.attentions.1.transformer_blocks.0.attn1',
+    'down_blocks.1.attentions.0.transformer_blocks.0.attn1',
+    # 'down_blocks.1.attentions.1.transformer_blocks.0.attn1',
+    'down_blocks.2.attentions.0.transformer_blocks.0.attn1',
+    'down_blocks.2.attentions.1.transformer_blocks.0.attn1',
+    'mid_block.attentions.0.transformer_blocks.0.attn1',
+    'up_blocks.1.attentions.0.transformer_blocks.0.attn1',
+    'up_blocks.1.attentions.1.transformer_blocks.0.attn1',
+    'up_blocks.1.attentions.2.transformer_blocks.0.attn1',
+    # 'up_blocks.2.attentions.0.transformer_blocks.0.attn1',
+    'up_blocks.2.attentions.1.transformer_blocks.0.attn1',
+    # 'up_blocks.2.attentions.2.transformer_blocks.0.attn1',
+    # 'up_blocks.3.attentions.0.transformer_blocks.0.attn1',
+    # 'up_blocks.3.attentions.1.transformer_blocks.0.attn1',
+    # 'up_blocks.3.attentions.2.transformer_blocks.0.attn1',
+]
+CrossAttentionLayers = [  # Enabled attn2 layer names; get_token_maps_deprecated skips any layer not listed here. Commented-out entries are disabled.
+    # 'down_blocks.0.attentions.0.transformer_blocks.0.attn2',
+    # 'down_blocks.0.attentions.1.transformer_blocks.0.attn2',
+    'down_blocks.1.attentions.0.transformer_blocks.0.attn2',
+    # 'down_blocks.1.attentions.1.transformer_blocks.0.attn2',
+    'down_blocks.2.attentions.0.transformer_blocks.0.attn2',
+    'down_blocks.2.attentions.1.transformer_blocks.0.attn2',
+    'mid_block.attentions.0.transformer_blocks.0.attn2',
+    'up_blocks.1.attentions.0.transformer_blocks.0.attn2',
+    'up_blocks.1.attentions.1.transformer_blocks.0.attn2',
+    'up_blocks.1.attentions.2.transformer_blocks.0.attn2',
+    # 'up_blocks.2.attentions.0.transformer_blocks.0.attn2',
+    'up_blocks.2.attentions.1.transformer_blocks.0.attn2',
+    # 'up_blocks.2.attentions.2.transformer_blocks.0.attn2',
+    # 'up_blocks.3.attentions.0.transformer_blocks.0.attn2',
+    # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2',
+    # 'up_blocks.3.attentions.2.transformer_blocks.0.attn2'
+]
 def split_attention_maps_over_steps(attention_maps):
 def plot_attention_maps(atten_map_list, obj_tokens, save_dir, seed, tokens_vis=None):
     atten_names = ['presoftmax', 'postsoftmax', 'postsoftmax_erosion']
+    for i, attn_map in enumerate(atten_map_list):
         n_obj = len(attn_map)
         plt.figure()
         plt.clf()
                 cmap=cmap, vmin=vmin, vmax=vmax
             )
             axs[tid].set_axis_off()
             if tokens_vis is not None:
                 if tid == n_obj-1:
                     axs_xlabel = 'other tokens'
         canvas = fig.canvas
         canvas.draw()
         width, height = canvas.get_width_height()
+        img = np.frombuffer(canvas.tostring_rgb(),
+                            dtype='uint8').reshape((height, width, 3))
         fig.tight_layout()
     return img
+def get_token_maps_deprecated(attention_maps, save_dir, width, height, obj_tokens, seed=0, tokens_vis=None):
     r"""Function to visualize attention maps.
     Args:
         save_dir (str): Path to save attention maps
         attention_maps
     )
     nsteps = len(attention_maps_cond)
     hw_ori = width * height
         attention_maps_cur = attention_maps_cond[step_num]
         for layer in attention_maps_cur.keys():
+            if step_num < 10 or layer not in CrossAttentionLayers:
                 continue
             attention_ind = attention_maps_cur[layer].cpu()
         attention_maps_averaged_normalized[i:i+1] for i in range(attention_maps_averaged_normalized.shape[0])]
     token_maps_vis = plot_attention_maps([attention_maps_averaged, attention_maps_averaged_normalized],
+                                         obj_tokens, save_dir, seed, tokens_vis)
     attention_maps_averaged_normalized = [attn_mask.unsqueeze(1).repeat(
         [1, 4, 1, 1]).cuda() for attn_mask in attention_maps_averaged_normalized]
     return attention_maps_averaged_normalized, token_maps_vis
+def get_token_maps(selfattn_maps, crossattn_maps, n_maps, save_dir, width, height, obj_tokens, kmeans_seed=0, tokens_vis=None,
+                   preprocess=False, segment_threshold=0.30, num_segments=9, return_vis=False):
+    r"""Compute one spatial token map per text span, plus a background map.
+
+    The self-attention maps are clustered with KMeans into ``num_segments``
+    segments on a 32x32 grid. Each segment is then assigned to every span
+    whose tokens score above ``segment_threshold`` on it in the averaged
+    cross-attention maps; segments claimed by no span form the background.
+
+    Args:
+        selfattn_maps (dict): Mapping whose values are self-attention tensors.
+            ``shape[1]`` of each value must be the square of 8, 16 or 32.
+        crossattn_maps (dict): Mapping whose values are cross-attention
+            tensors; the last axis is indexed by the ids in ``obj_tokens``.
+        n_maps: Unused; kept for interface compatibility.
+        save_dir (str): Directory that receives ``segmentation_k<N>.jpg``;
+            also forwarded to ``plot_attention_maps``.
+        width (int): Width of the returned token maps.
+        height (int): Height of the returned token maps.
+        obj_tokens: One tensor of token indices per span (``.numpy()`` is
+            called on each element).
+        kmeans_seed (int): ``random_state`` of the KMeans clustering, so the
+            segmentation is reproducible; also forwarded to
+            ``plot_attention_maps`` as its ``seed`` argument.
+        tokens_vis: Forwarded to ``plot_attention_maps``.
+        preprocess: Unused; kept for interface compatibility.
+        segment_threshold (float): Minimum mean normalized cross-attention
+            score inside a segment for a span to claim that segment.
+        num_segments (int): Number of KMeans clusters.
+        return_vis (bool): Also return the two visualization images.
+
+    Returns:
+        A list of ``len(obj_tokens) + 1`` CUDA tensors of shape
+        ``(1, 4, height, width)`` (the last one is the background map). When
+        ``return_vis`` is true, the tuple
+        ``(token_maps, segments_vis, token_maps_vis)`` is returned instead.
+    """
+    # create the segmentation mask using self-attention maps
+    resolution = 32
+    attn_maps_1024 = {8: [], 16: [], 32: []}
+    for attn_map in selfattn_maps.values():
+        resolution_map = np.sqrt(attn_map.shape[1]).astype(int)
+        attn_map = attn_map.reshape(
+            1, resolution_map, resolution_map, resolution_map**2).permute([3, 0, 1, 2])
+        attn_map = torch.nn.functional.interpolate(attn_map, (resolution, resolution),
+                                                   mode='bicubic', antialias=True)
+        attn_maps_1024[resolution_map].append(attn_map.permute([1, 2, 3, 0]).reshape(
+            1, resolution**2, resolution_map**2))
+    # Skip resolutions for which no map was collected: torch.cat raises on
+    # an empty list.
+    attn_maps_1024 = torch.cat([torch.cat(v).mean(0).cpu()
+                                for v in attn_maps_1024.values() if v], -1).numpy()
+    # Seed the clustering so that the same inputs give the same segments.
+    kmeans = KMeans(n_clusters=num_segments, n_init=10,
+                    random_state=kmeans_seed).fit(attn_maps_1024)
+    clusters = kmeans.labels_
+    clusters = clusters.reshape(resolution, resolution)
+    fig = plt.figure()
+    plt.imshow(clusters)
+    plt.axis('off')
+    plt.savefig(os.path.join(save_dir, 'segmentation_k%d.jpg' % (num_segments)),
+                bbox_inches='tight', pad_inches=0)
+    if return_vis:
+        canvas = fig.canvas
+        canvas.draw()
+        cav_width, cav_height = canvas.get_width_height()
+        # NOTE(review): tostring_rgb is reported as deprecated in newer
+        # Matplotlib releases - confirm against the pinned version.
+        segments_vis = np.frombuffer(canvas.tostring_rgb(),
+                                     dtype='uint8').reshape((cav_height, cav_width, 3))
+    plt.close()
+    # label the segmentation mask using cross-attention maps
+    cross_attn_maps_1024 = []
+    for attn_map in crossattn_maps.values():
+        resolution_map = np.sqrt(attn_map.shape[1]).astype(int)
+        attn_map = attn_map.reshape(
+            1, resolution_map, resolution_map, -1).permute([0, 3, 1, 2])
+        attn_map = torch.nn.functional.interpolate(attn_map, (resolution, resolution),
+                                                   mode='bicubic', antialias=True)
+        cross_attn_maps_1024.append(attn_map.permute([0, 2, 3, 1]))
+        # Remember the dtype explicitly instead of reading the loop variable
+        # after the loop has ended.
+        token_map_dtype = attn_map.dtype
+    cross_attn_maps_1024 = torch.cat(
+        cross_attn_maps_1024).mean(0).cpu().numpy()
+    normalized_span_maps = []
+    for token_ids in obj_tokens:
+        span_token_maps = cross_attn_maps_1024[:, :, token_ids.numpy()]
+        normalized_span_map = np.zeros_like(span_token_maps)
+        for i in range(span_token_maps.shape[-1]):
+            curr_noun_map = span_token_maps[:, :, i]
+            normalized_span_map[:, :, i] = (
+                curr_noun_map - np.abs(curr_noun_map.min())) / curr_noun_map.max()
+        normalized_span_maps.append(normalized_span_map)
+    foreground_token_maps = [np.zeros(clusters.shape)
+                             for _ in normalized_span_maps]
+    background_map = np.zeros(clusters.shape)
+    for c in range(num_segments):
+        cluster_mask = np.zeros_like(clusters)
+        cluster_mask[clusters == c] = 1.
+        is_foreground = False
+        for normalized_span_map, foreground_nouns_map, token_ids in zip(normalized_span_maps, foreground_token_maps, obj_tokens):
+            score_maps = [cluster_mask * normalized_span_map[:, :, i]
+                          for i in range(len(token_ids))]
+            scores = [score_map.sum() / cluster_mask.sum()
+                      for score_map in score_maps]
+            # A segment may be claimed by several spans at once.
+            if max(scores) > segment_threshold:
+                foreground_nouns_map += cluster_mask
+                is_foreground = True
+        if not is_foreground:
+            background_map += cluster_mask
+    foreground_token_maps.append(background_map)
+    # resize the token maps and visualization
+    resized_token_maps = torch.cat([torch.nn.functional.interpolate(torch.from_numpy(token_map).unsqueeze(0).unsqueeze(
+        0), (height, width), mode='bicubic', antialias=True)[0] for token_map in foreground_token_maps]).clamp(0, 1)
+    # Normalize across maps so the weights at each pixel sum to (about) one.
+    resized_token_maps = resized_token_maps / \
+        (resized_token_maps.sum(0, True)+1e-8)
+    resized_token_maps = [token_map.unsqueeze(
+        0) for token_map in resized_token_maps]
+    foreground_token_maps = [token_map[None, :, :]
+                             for token_map in foreground_token_maps]
+    token_maps_vis = plot_attention_maps([foreground_token_maps, resized_token_maps], obj_tokens,
+                                         save_dir, kmeans_seed, tokens_vis)
+    resized_token_maps = [token_map.unsqueeze(1).repeat(
+        [1, 4, 1, 1]).to(token_map_dtype).cuda() for token_map in resized_token_maps]
+    if return_vis:
+        return resized_token_maps, segments_vis, token_maps_vis
+    else:
+        return resized_token_maps

utils/richtext_utils.py CHANGED Viewed

@@ -27,7 +27,7 @@ def seed_everything(seed):
     torch.cuda.manual_seed(seed)
-def hex_to_rgb(hex_string, return_nearest_color=False, device='cuda'):
     r"""
     Covert Hex triplet to RGB triplet.
     """
@@ -40,8 +40,8 @@ def hex_to_rgb(hex_string, return_nearest_color=False, device='cuda'):
     rgb = torch.FloatTensor((red, green, blue))[None, :, None, None]/255.
     if return_nearest_color:
         nearest_color = find_nearest_color(rgb)
-        return rgb.to(device), nearest_color
-    return rgb.to(device)
 def find_nearest_color(rgb):
@@ -56,7 +56,7 @@ def find_nearest_color(rgb):
     return nearest_color
-def font2style(font, device='cuda'):
     r"""
     Convert the font name to the style name.
     """
@@ -71,7 +71,7 @@ def font2style(font, device='cuda'):
             'Akronim': 'Abstract Cubism, Pablo Picasso', }[font]
-def parse_json(json_str, device):
     r"""
     Convert the JSON string to attributes.
     """
@@ -121,7 +121,7 @@ def parse_json(json_str, device):
             if 'color' in span['attributes']:
                 use_grad_guidance = True
                 color_rgb, nearest_color = hex_to_rgb(
-                    span['attributes']['color'], True, device=device)
                 if prev_color_rgb == color_rgb:
                     prev_text_prompt = color_text_prompts[-1]
                     color_text_prompts[-1] = prev_text_prompt + \
@@ -197,8 +197,8 @@ def get_attention_control_input(model, base_tokens, size_text_prompts_and_sizes)
             word_pos.append(base_tokens.index(size_token)+1)
             font_sizes.append(font_size)
     if len(word_pos) > 0:
-        word_pos = torch.LongTensor(word_pos).to(model.device)
-        font_sizes = torch.FloatTensor(font_sizes).to(model.device)
     else:
         word_pos = None
         font_sizes = None

     torch.cuda.manual_seed(seed)
+def hex_to_rgb(hex_string, return_nearest_color=False):
     r"""
     Covert Hex triplet to RGB triplet.
     """
     rgb = torch.FloatTensor((red, green, blue))[None, :, None, None]/255.
     if return_nearest_color:
         nearest_color = find_nearest_color(rgb)
+        return rgb.cuda(), nearest_color
+    return rgb.cuda()
 def find_nearest_color(rgb):
     return nearest_color
+def font2style(font):
     r"""
     Convert the font name to the style name.
     """
             'Akronim': 'Abstract Cubism, Pablo Picasso', }[font]
+def parse_json(json_str):
     r"""
     Convert the JSON string to attributes.
     """
             if 'color' in span['attributes']:
                 use_grad_guidance = True
                 color_rgb, nearest_color = hex_to_rgb(
+                    span['attributes']['color'], True)
                 if prev_color_rgb == color_rgb:
                     prev_text_prompt = color_text_prompts[-1]
                     color_text_prompts[-1] = prev_text_prompt + \
             word_pos.append(base_tokens.index(size_token)+1)
             font_sizes.append(font_size)
     if len(word_pos) > 0:
+        word_pos = torch.LongTensor(word_pos).cuda()
+        font_sizes = torch.FloatTensor(font_sizes).cuda()
     else:
         word_pos = None
         font_sizes = None