From a7eda4a87dc091faa8e3ff8c789c5dcb9fff232e Mon Sep 17 00:00:00 2001 From: Johannes Stelzer Date: Tue, 9 Jan 2024 14:38:12 +0100 Subject: [PATCH] working version for sdxl and sdxl turbo --- diffusers_holder.py | 18 +++---- latent_blending.py | 125 ++++++++------------------------------------ 2 files changed, 29 insertions(+), 114 deletions(-) diff --git a/diffusers_holder.py b/diffusers_holder.py index 9ade4a8..06c90cb 100644 --- a/diffusers_holder.py +++ b/diffusers_holder.py @@ -780,8 +780,8 @@ class DiffusersHolder(): if __name__ == "__main__": from PIL import Image from diffusers import AutoencoderTiny - pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0" - # pretrained_model_name_or_path = "stabilityai/sdxl-turbo" + # pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0" + pretrained_model_name_or_path = "stabilityai/sdxl-turbo" pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16, variant="fp16") pipe.to("cuda") #% @@ -792,13 +792,13 @@ if __name__ == "__main__": self = DiffusersHolder(pipe) prompt1 = "photo of underwater landscape, fish, und the sea, incredible detail, high resolution" negative_prompt = "blurry, ugly, pale" - num_inference_steps = 30 - guidance_scale = 4 + num_inference_steps = 4 + guidance_scale = 0 self.set_num_inference_steps(num_inference_steps) self.guidance_scale = guidance_scale - prefix='full' + prefix='turbo' for i in range(10): self.set_negative_prompt(negative_prompt) @@ -809,7 +809,7 @@ if __name__ == "__main__": # img_refx = self.pipe(prompt=prompt1, negative_prompt=negative_prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale)[0] - img_refx = self.run_diffusion_sd_xl_resanity(text_embeddings=text_embeddings, latents_start=latents_start, return_image=True) + img_refx = self.run_diffusion_sd_xl(text_embeddings=text_embeddings, latents_start=latents_start, return_image=True) dt_ref = time.time() - t0 img_refx.save(f"x_{prefix}_{i}.jpg") @@ -830,11 +830,7 @@ if __name__ == "__main__": # dt_dh = time.time() - t0 - """ - sth bad in call - sth bad in cond - sth bad in noise - """ + # xxxx # #%% diff --git a/latent_blending.py b/latent_blending.py index 985b3d1..08a71c5 100644 --- a/latent_blending.py +++ b/latent_blending.py @@ -584,99 +584,19 @@ class LatentBlending(): mixing_coeffs=mixing_coeffs, return_image=return_image) - def run_upscaling( - self, - dp_img: str, - depth_strength: float = 0.65, - num_inference_steps: int = 100, - nmb_max_branches_highres: int = 5, - nmb_max_branches_lowres: int = 6, - duration_single_segment=3, - fps=24, - fixed_seeds: Optional[List[int]] = None): - r""" - Runs upscaling with the x4 model. Requires that you run a transition before with a low-res model and save the results using write_imgs_transition. - Args: - dp_img: str - Path to the low-res transition path (as saved in write_imgs_transition) - depth_strength: - Determines how deep the first injection will happen. - Deeper injections will cause (unwanted) formation of new structures, - more shallow values will go into alpha-blendy land. - num_inference_steps: - Number of diffusion steps. Higher values will take more compute time. - nmb_max_branches_highres: int - Number of final branches of the upscaling transition pass. Note this is the number - of branches between each pair of low-res images. - nmb_max_branches_lowres: int - Number of input low-res images, subsampling all transition images written in the low-res pass. - Setting this number lower (e.g. 6) will decrease the compute time but not affect the results too much. - duration_single_segment: float - The duration of each high-res movie segment. You will have nmb_max_branches_lowres-1 segments in total. - fps: float - frames per second of movie - fixed_seeds: Optional[List[int)]: - You can supply two seeds that are used for the first and second keyframe (prompt1 and prompt2). - Otherwise random seeds will be taken. - """ - fp_yml = os.path.join(dp_img, "lowres.yaml") - fp_movie = os.path.join(dp_img, "movie_highres.mp4") - ms = MovieSaver(fp_movie, fps=fps) - assert os.path.isfile(fp_yml), "lowres.yaml does not exist. did you forget run_upscaling_step1?" - dict_stuff = yml_load(fp_yml) - - # load lowres images - nmb_images_lowres = dict_stuff['nmb_images'] - prompt1 = dict_stuff['prompt1'] - prompt2 = dict_stuff['prompt2'] - idx_img_lowres = np.round(np.linspace(0, nmb_images_lowres - 1, nmb_max_branches_lowres)).astype(np.int32) - imgs_lowres = [] - for i in idx_img_lowres: - fp_img_lowres = os.path.join(dp_img, f"lowres_img_{str(i).zfill(4)}.jpg") - assert os.path.isfile(fp_img_lowres), f"{fp_img_lowres} does not exist. did you forget run_upscaling_step1?" - imgs_lowres.append(Image.open(fp_img_lowres)) - - # set up upscaling - text_embeddingA = self.dh.get_text_embedding(prompt1) - text_embeddingB = self.dh.get_text_embedding(prompt2) - list_fract_mixing = np.linspace(0, 1, nmb_max_branches_lowres - 1) - for i in range(nmb_max_branches_lowres - 1): - print(f"Starting movie segment {i+1}/{nmb_max_branches_lowres-1}") - self.text_embedding1 = interpolate_linear(text_embeddingA, text_embeddingB, list_fract_mixing[i]) - self.text_embedding2 = interpolate_linear(text_embeddingA, text_embeddingB, 1 - list_fract_mixing[i]) - if i == 0: - recycle_img1 = False - else: - self.swap_forward() - recycle_img1 = True - - self.set_image1(imgs_lowres[i]) - self.set_image2(imgs_lowres[i + 1]) - - list_imgs = self.run_transition( - recycle_img1=recycle_img1, - recycle_img2=False, - num_inference_steps=num_inference_steps, - depth_strength=depth_strength, - nmb_max_branches=nmb_max_branches_highres) - list_imgs_interp = add_frames_linear_interp(list_imgs, fps, duration_single_segment) - - # Save movie frame - for img in list_imgs_interp: - ms.write_frame(img) - ms.finalize() @torch.no_grad() def get_mixed_conditioning(self, fract_mixing): - if self.dh.use_sd_xl: - text_embeddings_mix = [] - for i in range(len(self.text_embedding1)): - text_embeddings_mix.append(interpolate_linear(self.text_embedding1[i], self.text_embedding2[i], fract_mixing)) - list_conditionings = [text_embeddings_mix] - else: - text_embeddings_mix = interpolate_linear(self.text_embedding1, self.text_embedding2, fract_mixing) - list_conditionings = [text_embeddings_mix] + text_embeddings_mix = [] + for i in range(len(self.text_embedding1)): + if self.text_embedding1[i] is None: + mix = None + else: + mix = interpolate_linear(self.text_embedding1[i], self.text_embedding2[i], fract_mixing) + text_embeddings_mix.append(mix) + list_conditionings = [text_embeddings_mix] + return list_conditionings @torch.no_grad() @@ -857,22 +777,30 @@ if __name__ == "__main__": from diffusers_holder import DiffusersHolder from diffusers import DiffusionPipeline from diffusers import AutoencoderTiny - pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16") + # pretrained_model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0" + pretrained_model_name_or_path = "stabilityai/sdxl-turbo" + + + pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=torch.float16, variant="fp16") pipe.to("cuda") - pipe.vae = AutoencoderTiny.from_pretrained('madebyollin/taesdxl', torch_device='cuda', torch_dtype=torch.float16) - pipe.vae = pipe.vae.cuda() + # pipe.vae = AutoencoderTiny.from_pretrained('madebyollin/taesdxl', torch_device='cuda', torch_dtype=torch.float16) + # pipe.vae = pipe.vae.cuda() dh = DiffusersHolder(pipe) # %% Next let's set up all parameters size_output = (512, 512) - prompt1 = "underwater landscape, fish, und the sea, incredible detail, high resolution" + # size_output = (1024, 1024) + prompt1 = "photo of underwater landscape, fish, und the sea, incredible detail, high resolution" prompt2 = "rendering of an alien planet, strange plants, strange creatures, surreal" negative_prompt = "blurry, ugly, pale" # Optional + duration_transition = 12 # In seconds # Spawn latent blending lb = LatentBlending(dh) + # lb.dh.set_num_inference_steps(num_inference_steps) + lb.set_guidance_scale(0) lb.set_prompt1(prompt1) lb.set_prompt2(prompt2) lb.set_dimensions(size_output) @@ -880,7 +808,7 @@ if __name__ == "__main__": # Run latent blending - lb.run_transition(fixed_seeds=[420, 421]) + lb.run_transition(fixed_seeds=[420, 421], t_compute_max_allowed=15) # Save movie fp_movie = f'test.mp4' @@ -889,12 +817,3 @@ if __name__ == "__main__": #%% - - """ - checkout good tree for num inference steps - checkout that good nmb inference step given - - timing1: dt_per_diff rename and fix (first time run is super slow) - timing2: measure time for decoding - - """ \ No newline at end of file