add guidance, intermediary latents

Files changed:
- app.py: +90 -41
- image_generator.py: +27 -13

app.py CHANGED
@@ -2,17 +2,27 @@ import gradio as gr
 from image_generator import ImageGenerator
 import os
 
-…
+
+header = """Hi! This HuggingFace Space is a demo for the homework from the [10th lesson](https://course.fast.ai/Lessons/lesson10.html) of the fast.ai course. You can pick some of the examples below and click the "Generate Image" Button.
+
+The code demonstrates:
+* how to use an existing image as a starting point for the output image generation, in addition to the prompt
+* how to use negative prompt
+* how to capture latents through the generation
+* how to mix prompt embeddings"""
+
+
+ig = ImageGenerator()
 print(ig)
 ig.load_models()
 ig.load_scheduler()
 
-def call(prompt, mix_prompt, mix_ratio, negative_prompt, steps, init_image ):
+def call(prompt, secondary_prompt, mix_ratio, negative_prompt, steps, init_image ):
 
-    print(f"{prompt=} {…
+    print(f"{prompt=} {secondary_prompt=} {mix_ratio=} {negative_prompt=} {steps=} {init_image=} ")
     generated_image, latents = ig.generate(
         prompt=prompt,
-        secondary_prompt=…
+        secondary_prompt=secondary_prompt,
         prompt_mix_ratio=mix_ratio,
         negative_prompt=negative_prompt,
         steps=steps,
@@ -26,40 +36,79 @@ def call(prompt, mix_prompt, mix_ratio, negative_prompt, steps, init_image ):
 
     return generated_image, noisy_latent
 
-…
-    gr.…
-…
-    gr.…
-…
+
+def update_noisy_image_visibility(init_image):
+    if init_image is None:
+        print("update_noisy_image_visibility: hide noisy image")
+        return gr.Image(type="pil", label="Starting Image with Added Noise", visible=False)
+    else:
+        print("update_noisy_image_visibility: show noisy image")
+        return gr.Image(type="pil", label="Starting Image with Added Noise", visible=True)
+
+def run_inference(prompt="", secondary_prompt="", mix_ratio=0.5, negative_prompt="", guidance=7.5, steps=10, init_image=None, progress=gr.Progress()): #, mix_ratio, negative_prompt, steps, starting_image, load_set_btn,
+    print(f"{prompt=} {secondary_prompt=} {mix_ratio=} {negative_prompt=} {steps=} {init_image=} ")
+
+    generated_image, latents = ig.generate(
+        prompt=prompt,
+        secondary_prompt=secondary_prompt,
+        prompt_mix_ratio=mix_ratio,
+        negative_prompt=negative_prompt,
+        guidance=guidance,
+        steps=steps,
+        init_image=init_image,
+        latent_callback_mod=1,
+        progress_tqdm=progress.tqdm )
+
+    if init_image is not None:
+        noisy_latent = latents[1]
+    else:
+        noisy_latent = None
+
+    return generated_image, noisy_latent, ig.image_grid(latents)
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        gr.Markdown(value=header)
+    with gr.Row():
+        with gr.Column(scale=1):
+            prompt = gr.Textbox(value="a cute dog", label="Prompt", info="primary prompt used to generate an image")
+            secondary_prompt = gr.Textbox(value=None, label="Secondary Prompt", info="secondary prompt to mix with the primary embeddings")
+            mix_ratio = gr.Slider(0, 1, value=0.5, label="Mix Ratio", info="mix ratio between primary and secondary prompt. 0 = primary only. 1 = secondary only")
+            negative_prompt = gr.Textbox(value=None, label="Negative Prompt", info="remove certain aspect from the picture")
+            guidance = gr.Slider(0, 14, value=7.5, label="Guidance", info="how closely the model should follow the prompt (higher the closer)")
+            steps = gr.Slider(10, 50, value=10, step=1, label="Generation Steps", info="How many steps are used to generate the picture")
+            init_image = gr.Image(type="pil", value=None, label="Starting Image",) # info="starting image from this image as opposed to random noise"
+            generate_image_btn = gr.Button("Generate Image")
+
+        with gr.Column(scale=1):
+            output_image = gr.Image(type="pil", label="Generated Image",)
+            noisy_image = gr.Image(type="pil", label="Starting Image with Added Noise", visible=False)
+            noisy_image.change(fn=update_noisy_image_visibility, inputs=init_image, outputs=noisy_image)
+            latent_images = gr.Image(type="pil", label="Latents through the denoising process", visible=True)
+
+    with gr.Row():
+        # broken images should be fixed soon https://github.com/gradio-app/gradio/issues/5067
+        gr.Examples(
+            examples=[
+                # simple prompt
+                ["a cute dog", "", "", "", 7.5, 10, None],
+
+                # negative prompt
+                ["a beautiful tree", "", "", "green", 7.5, 10, None],
+
+                # with base image
+                ["a painting of Paris at night in the style of Pierre Auguste Renoir", "", "", "", 7.5, 50, os.path.join( os.path.dirname(__file__), "examples/ex4.jpg")],
+
+                # with prompt mix
+                ["a sloth", "a jaguar", 0.5, "", 7.5, 30, None],
+            ],
+            inputs=[prompt, secondary_prompt, mix_ratio, negative_prompt, guidance, steps, init_image],
+            outputs=[output_image, noisy_image, latent_images],
+            fn=run_inference,
+            cache_examples=False)
+
+    generate_image_btn.click(
+        fn=run_inference,
+        inputs=[prompt, secondary_prompt, mix_ratio, negative_prompt, guidance, steps, init_image],
+        outputs=[output_image, noisy_image, latent_images])
+
+demo.launch()
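The new `run_inference` accepts a `progress=gr.Progress()` argument and passes `progress.tqdm` into `ig.generate`, so each denoising step advances a progress bar in the UI. A minimal sketch of that pattern (the names `slow_fn` and `demo2` are illustrative, not from this commit):

```python
import time
import gradio as gr

def slow_fn(n, progress=gr.Progress()):
    # progress.tqdm wraps any iterable like tqdm does, but streams each
    # step to the Gradio progress bar attached to this event handler
    for _ in progress.tqdm(range(int(n)), desc="Working"):
        time.sleep(0.1)
    return f"done after {int(n)} steps"

with gr.Blocks() as demo2:
    steps = gr.Slider(1, 50, value=10, step=1, label="Steps")
    out = gr.Textbox(label="Result")
    gr.Button("Run").click(fn=slow_fn, inputs=steps, outputs=out)

demo2.launch()
```

The commit also updates component properties by returning a fresh `gr.Image(..., visible=...)` from an event handler, which is how `update_noisy_image_visibility` shows or hides the noisy-latent preview.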
image_generator.py CHANGED
@@ -19,11 +19,8 @@ from tqdm.auto import tqdm
 
 logging.disable(logging.WARNING)
 class ImageGenerator():
-    def __init__(self,
-                 g:int=7.5,
-                ):
+    def __init__(self):
         self.latent_images = []
-        self.g = g
         self.width = 512
         self.height = 512
         self.generator = torch.manual_seed(32)
@@ -31,12 +28,23 @@ class ImageGenerator():
         if torch.cuda.is_available():
             self.device = torch.device("cuda")
             self.float_size = torch.float16
+        elif torch.backends.mps.is_available():
+            self.device = torch.device("mps")
+            self.float_size = torch.float32
         else:
+            if not torch.backends.mps.is_built():
+                print("MPS not available because the current PyTorch install was not "
+                      "built with MPS enabled.")
+            else:
+                print("MPS not available because the current MacOS version is not 12.3+ "
+                      "and/or you do not have an MPS-enabled device on this machine.")
             self.device = torch.device("cpu")
             self.float_size = torch.float32
+
+        print(f"pytorch device: {self.device}")
 
     def __repr__(self):
-        return f"Image Generator with {self.…
+        return f"Image Generator with {self.width=} {self.height=}"
 
     def load_models(self):
         self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=self.float_size)
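The device fallback above can be read as a small standalone helper; this sketch mirrors the same logic (`pick_device` is an illustrative name, not part of the repo):

```python
import torch

def pick_device() -> tuple[torch.device, torch.dtype]:
    # half precision is safe on CUDA; MPS and CPU stay in float32,
    # matching the float_size choices made in __init__ above
    if torch.cuda.is_available():
        return torch.device("cuda"), torch.float16
    if torch.backends.mps.is_available():
        return torch.device("mps"), torch.float32
    return torch.device("cpu"), torch.float32

device, dtype = pick_device()
print(f"pytorch device: {device}")
```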
@@ -48,7 +56,6 @@ class ImageGenerator():
     def load_scheduler( self,
                         beta_start : float=0.00085,
                         beta_end : float=0.012,
-                        beta_schedule : str="scaled_linear",
                         num_train_timesteps :int=1000):
 
         self.scheduler = LMSDiscreteScheduler(
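For reference, a standalone construction of the scheduler with the defaults visible in the signature above. The call body is cut off in the rendered diff, so the `beta_schedule` value below is an assumption (the removed parameter's old default, presumably now hard-coded inside `load_scheduler`):

```python
from diffusers import LMSDiscreteScheduler

scheduler = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",  # assumption: the old default, now fixed in load_scheduler
    num_train_timesteps=1000,
)
scheduler.set_timesteps(30)      # must be called before reading timesteps
print(len(scheduler.timesteps))  # 30
```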
@@ -63,10 +70,11 @@ class ImageGenerator():
 
     def pil_to_latent(self, image: Image) -> torch.Tensor:
         with torch.no_grad():
+            image = image.resize(size=(self.width,self.height))
             np_img = np.transpose( (( np.array(image) / 255)-0.5)*2, (2,0,1)) # turn pil image into np array with values between -1 and 1
             # print(f"{np_img.shape=}") # 4, 64, 64
 
-            np_images = np.repeat(np_img[np.newaxis, :, :], self.bs, axis=0) # adding a new dimension and repeating the image for each prompt
+            np_images = np.repeat(np_img[np.newaxis, :, :], self.bs, axis=0).astype(np.float32) # adding a new dimension and repeating the image for each prompt, float32 required for mac
             # print(f"{np_images.shape=}")
 
             decoded_latent = torch.from_numpy(np_images).to(self.device).float() #<-- stability-ai vae uses half(), compvis vae uses float?
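A quick self-contained check of the pixel normalization `pil_to_latent` applies: values map from [0, 255] to [-1, 1] and the array goes channels-first. The solid-color image is a stand-in:

```python
import numpy as np
from PIL import Image

img = Image.new("RGB", (512, 512), color=(128, 64, 255))
np_img = np.transpose(((np.array(img) / 255) - 0.5) * 2, (2, 0, 1))  # CHW in [-1, 1]
print(np_img.shape)  # (3, 512, 512)

# inverting the transform recovers the original pixel values
restored = (np.transpose(np_img, (1, 2, 0)) / 2 + 0.5) * 255
print(np.allclose(np.array(img), restored))  # True
```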
@@ -96,6 +104,7 @@ class ImageGenerator():
         return Image.fromarray((image*255).round().astype("uint8"))
 
     def image_grid(self, imgs: [Image]) -> Image:
+        print(len(imgs))
         w,h = imgs[0].size
         cols = len(imgs)
         grid = Image.new('RGB', size=(cols*w, h))
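Only the head of `image_grid` is visible in the hunk; a complete minimal version of such a helper would look like this (the paste loop is a reconstruction, not the file's exact body):

```python
from PIL import Image

def image_grid(imgs: list[Image.Image]) -> Image.Image:
    w, h = imgs[0].size
    cols = len(imgs)
    grid = Image.new("RGB", size=(cols * w, h))
    for i, img in enumerate(imgs):
        grid.paste(img, box=(i * w, 0))  # one column per captured latent snapshot
    return grid
```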
@@ -125,21 +134,25 @@ class ImageGenerator():
         self.latent_images.append(self.tensor_to_pil(decoded))
 
     def generate(self,
-                 prompt : str,
+                 prompt : str="",
                  secondary_prompt: str=None,
                  prompt_mix_ratio : float=0.5,
                  negative_prompt="",
                  seed : int=32,
+                 guidance :float=7.5,
                  steps : int=30,
                  start_step_ratio : float=1/5,
                  init_image : Image=None,
-                 latent_callback_mod : int=10):
+                 latent_callback_mod : int=10,
+                 progress_tqdm: callable=tqdm):
         self.latent_images = []
         if not negative_prompt: negative_prompt = ""
-…
+        print(f"ImageGenerator: {prompt=} {secondary_prompt=} {prompt_mix_ratio=} {negative_prompt=} {guidance=} {steps=} {init_image=} ")
+
         with torch.no_grad():
             text = self.text_enc(prompt)
             if secondary_prompt:
+                print("using secondary prompt")
                 sec_prompt_text = self.text_enc(secondary_prompt)
                 text = text * prompt_mix_ratio + sec_prompt_text * ( 1 - prompt_mix_ratio )
             uncond = self.text_enc(negative_prompt * self.bs, text.shape[1])
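The prompt mix is a plain linear interpolation between the two encoders' outputs. A toy illustration with random tensors standing in for the CLIP embeddings (shape `(1, 77, 768)`, the output size of `openai/clip-vit-large-patch14`):

```python
import torch

prompt_mix_ratio = 0.5
text = torch.randn(1, 77, 768)             # stand-in for text_enc(prompt)
sec_prompt_text = torch.randn(1, 77, 768)  # stand-in for text_enc(secondary_prompt)

mixed = text * prompt_mix_ratio + sec_prompt_text * (1 - prompt_mix_ratio)
# note the convention this formula implements: 1.0 keeps only the primary
# prompt and 0.0 only the secondary one, the reverse of what the Mix Ratio
# slider's help text in app.py describes
print(mixed.shape)  # torch.Size([1, 77, 768])
```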
@@ -157,21 +170,22 @@ class ImageGenerator():
             latents = latents * self.scheduler.init_noise_sigma
             # print(f"{latents.shape=}")
         else:
+            print("using base image")
             start_steps = int(steps * start_step_ratio) # 0%: too much noise, 100% no noise
             # print(f"{start_steps=}")
-            latents =self.…
+            latents =self.pil_to_latent(init_image)
             self.latent_callback(latents)
             latents = self.add_noise(latents, start_steps).to(self.device).float()
             self.latent_callback(latents)
 
         latents = latents.to(self.device).float()
 
-        for i,ts in enumerate(…
+        for i,ts in enumerate(progress_tqdm(self.scheduler.timesteps, desc="Latent Generation")): #leave=False, does not work with gradio
             if i >= start_steps:
                 inp = self.scheduler.scale_model_input(torch.cat([latents] * 2), ts)
                 with torch.no_grad():
                     u,t = self.unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2) #todo, grab those with callbacks
-                pred = u + self.g*(t-u)
+                pred = u + guidance*(t-u)
                 # pred = u + self.g*(t-u)/torch.norm(t-u)*torch.norm(u)
                 latents = self.scheduler.step(pred, ts, latents).prev_sample
 
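`pred = u + guidance*(t-u)` is classifier-free guidance: the unconditional prediction `u` is pushed past the text-conditional prediction `t` by the guidance scale, which is what the new `guidance` parameter (formerly the constructor's `g`) controls. A toy numeric version with random stand-ins for the UNet outputs:

```python
import torch

guidance = 7.5
u = torch.randn(1, 4, 64, 64)  # stand-in: noise prediction for the empty/negative prompt
t = torch.randn(1, 4, 64, 64)  # stand-in: noise prediction for the actual prompt

pred = u + guidance * (t - u)
# guidance == 0.0 gives u, 1.0 gives t, and values above 1 extrapolate
# past t, which is why higher guidance follows the prompt more closely
assert torch.allclose(pred, t * guidance + u * (1 - guidance))
print(pred.shape)
```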