#!/usr/bin/env python

from __future__ import annotations

import os

import gradio as gr

from inference_fatezero import merge_config_then_run

# TITLE = '# [FateZero](http://fate-zero-edit.github.io/)'
HF_TOKEN = os.getenv('HF_TOKEN')
# pipe = InferencePipeline(HF_TOKEN)
pipe = merge_config_then_run()
# app = InferenceUtil(HF_TOKEN)
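# merge_config_then_run (from inference_fatezero) presumably merges the FateZero
# base config with the arguments collected from the UI; its run() method is the
# single inference callback wired to the examples, the prompt box, and the button below.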
with gr.Blocks(css='style.css') as demo:
    # gr.Markdown(TITLE)
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h1 style="font-weight: 900; font-size: 2rem; margin: 0rem">
            FateZero: Fusing Attentions for Zero-shot Text-based Video Editing
        </h1>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <a href="https://chenyangqiqi.github.io/">Chenyang Qi</a>,
            <a href="https://vinthony.github.io/academic/">Xiaodong Cun</a>, <a href="https://yzhang2016.github.io/">Yong Zhang</a>,
            <a href="https://chenyanglei.github.io">Chenyang Lei</a>, <a href="https://xinntao.github.io/">Xintao Wang</a>,
            <a href="https://scholar.google.com/citations?user=4oXBp9UAAAAJ&hl=zh-CN">Ying Shan</a>,
            <a href="http://cqf.io">Qifeng Chen</a>
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin: 0rem">
            <span class="link-block">
                [<a href="https://arxiv.org/abs/2303.09535" target="_blank"
                class="external-link">
                <span class="icon">
                    <i class="ai ai-arxiv"></i>
                </span>
                <span>arXiv</span>
                </a>]
            </span>
            <!-- GitHub link -->
            <span class="link-block">
                [<a href="https://github.com/ChenyangQiQi/FateZero" target="_blank"
                class="external-link">
                <span class="icon">
                    <i class="fab fa-github"></i>
                </span>
                <span>Code</span>
                </a>]
            </span>
            <!-- Homepage link -->
            <span class="link-block">
                [<a href="http://fate-zero-edit.github.io/" target="_blank"
                class="external-link">
                <span class="icon">
                    <i class="fab fa-github"></i>
                </span>
                <span>Homepage</span>
                </a>]
            </span>
            <!-- Video link -->
            <span class="link-block">
                [<a href="https://hkustconnect-my.sharepoint.com/:v:/g/personal/cqiaa_connect_ust_hk/EXKDI_nahEhKtiYPvvyU9SkBDTG2W4G1AZ_vkC7ekh3ENw?e=ficp9t" target="_blank"
                class="external-link">
                <span class="icon">
                    <i class="fab fa-youtube"></i>
                </span>
                <span>Video</span>
                </a>]
            </span>
        </h2>
        <h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
            TL;DR: FateZero is the first zero-shot framework for text-driven video editing with pretrained diffusion models, without any training.
        </h2>
        </div>
        """)
| gr.HTML(""" | |
| <p>We provide an <a href="https://github.com/ChenyangQiQi/FateZero/blob/main/docs/EditingGuidance.md"> Editing Guidance </a> to help users to choose hyperparameters when editing in-the-wild video. | |
| <p>Note that due to the limits of memory and computing resources on hugging-face, the results here are only toy examples and take longer to edit. | |
| <p>You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. | |
| <br/> | |
| <a href="https://huggingface.co/spaces/chenyangqi/FateZero?duplicate=true"> | |
| <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> | |
| <p>Alternatively, try our GitHub <a href=https://github.com/ChenyangQiQi/FateZero> code </a> on your GPU. | |
| </p>""") | |
    with gr.Row():
        with gr.Column():
            with gr.Accordion('Input Video', open=True):
                # user_input_video = gr.File(label='Input Source Video')
                user_input_video = gr.Video(label='Input Source Video', source='upload', type='numpy', format="mp4", visible=True).style(height="auto")

                with gr.Accordion('Temporal Crop offset and Sampling Stride', open=False):
                    n_sample_frame = gr.Slider(label='Number of Frames',
                                               minimum=0,
                                               maximum=32,
                                               step=1,
                                               value=8)
                    stride = gr.Slider(label='Temporal stride',
                                       minimum=0,
                                       maximum=20,
                                       step=1,
                                       value=1)
                    start_sample_frame = gr.Number(label='Start frame in the video',
                                                   value=0,
                                                   precision=0)

                with gr.Accordion('Spatial Crop offset', open=False):
                    left_crop = gr.Number(label='Left crop',
                                          value=0,
                                          precision=0)
                    right_crop = gr.Number(label='Right crop',
                                           value=0,
                                           precision=0)
                    top_crop = gr.Number(label='Top crop',
                                         value=0,
                                         precision=0)
                    bottom_crop = gr.Number(label='Bottom crop',
                                            value=0,
                                            precision=0)

                offset_list = [
                    left_crop,
                    right_crop,
                    top_crop,
                    bottom_crop,
                ]
                ImageSequenceDataset_list = [
                    start_sample_frame,
                    n_sample_frame,
                    stride
                ] + offset_list
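                # Start frame, frame count, temporal stride, and the four crop offsets are
                # unpacked (via *ImageSequenceDataset_list) into the pipe.run inputs below,
                # presumably to configure the image-sequence dataset loader.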
            model_id = gr.Dropdown(
                label='Model ID',
                choices=[
                    'CompVis/stable-diffusion-v1-4',
                    # add shape editing ckpt here
                ],
                value='CompVis/stable-diffusion-v1-4')

            with gr.Accordion('Text Prompt', open=True):
                source_prompt = gr.Textbox(label='Source Prompt',
                                           info='A good prompt describes each frame and most objects in the video. In particular, it should contain the object or attribute that we want to edit or preserve.',
                                           max_lines=1,
                                           placeholder='Example: "a silver jeep driving down a curvy road in the countryside"',
                                           value='a silver jeep driving down a curvy road in the countryside')
                target_prompt = gr.Textbox(label='Target Prompt',
                                           info='A target prompt that fits the video composition usually gives better results (e.g., a "sunflower" video works better with a "Van Gogh" prompt than with a "Monet" prompt).',
                                           max_lines=1,
                                           placeholder='Example: "watercolor painting of a silver jeep driving down a curvy road in the countryside"',
                                           value='watercolor painting of a silver jeep driving down a curvy road in the countryside')

            run_button = gr.Button('Generate')
        with gr.Column():
            result = gr.Video(label='Result')
            # result.style(height=512, width=512)

            with gr.Accordion('FateZero Parameters for attention fusing', open=True):
                cross_replace_steps = gr.Slider(label='Cross-att replace steps',
                                                info='More steps replace more cross-attention maps, which better preserves the semantic layout.',
                                                minimum=0.0,
                                                maximum=1.0,
                                                step=0.1,
                                                value=0.7)
                self_replace_steps = gr.Slider(label='Self-att replace steps',
                                               info='More steps replace more spatial-temporal self-attention maps, which better preserves geometry and motion.',
                                               minimum=0.0,
                                               maximum=1.0,
                                               step=0.1,
                                               value=0.7)
                enhance_words = gr.Textbox(label='Enhanced words',
                                           info='Amplify the cross-attention of the target words.',
                                           max_lines=1,
                                           placeholder='Example: "watercolor"',
                                           value='watercolor')
                enhance_words_value = gr.Slider(label='Target cross-att amplification',
                                                info='A larger value brings in more elements of the target words.',
                                                minimum=0.0,
                                                maximum=20.0,
                                                step=1,
                                                value=10)
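                # The replace-step sliders range over 0.0-1.0 and are presumably interpreted
                # as the fraction of denoising steps during which the source attention maps
                # are fused in, as described in the FateZero paper.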
            with gr.Accordion('DDIM Parameters', open=True):
                num_steps = gr.Slider(label='Number of Steps',
                                      info='A larger value gives better editing capability but takes more time and memory (50 steps may produce out-of-memory errors).',
                                      minimum=0,
                                      maximum=30,
                                      step=1,
                                      value=15)
                guidance_scale = gr.Slider(label='CFG Scale',
                                           minimum=0,
                                           maximum=50,
                                           step=0.1,
                                           value=7.5)
    with gr.Row():
        from example import style_example
        examples = style_example
        gr.Examples(examples=examples,
                    inputs=[
                        model_id,
                        user_input_video,
                        source_prompt,
                        target_prompt,
                        cross_replace_steps,
                        self_replace_steps,
                        enhance_words,
                        enhance_words_value,
                        num_steps,
                        guidance_scale,
                        user_input_video,
                        *ImageSequenceDataset_list
                    ],
                    outputs=result,
                    fn=pipe.run,
                    cache_examples=True,
                    # cache_examples=os.getenv('SYSTEM') == 'spaces'
                    )
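        # cache_examples=True makes Gradio run every example once at startup and serve the
        # cached outputs afterwards; the commented-out line would enable caching only when
        # the SYSTEM environment variable is set to 'spaces' (i.e., on the hosted Space).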
    inputs = [
        model_id,
        user_input_video,
        source_prompt,
        target_prompt,
        cross_replace_steps,
        self_replace_steps,
        enhance_words,
        enhance_words_value,
        num_steps,
        guidance_scale,
        user_input_video,
        *ImageSequenceDataset_list
    ]
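    # Submitting the target prompt (pressing Enter) and clicking Generate trigger the same
    # pipe.run call with the same inputs.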
    target_prompt.submit(fn=pipe.run, inputs=inputs, outputs=result)
    run_button.click(fn=pipe.run, inputs=inputs, outputs=result)

demo.queue().launch()