I'm trying to figure this one out, been at it for a while and can't seem to make any headway. Any help is greatly appreciated!!! It is my first time using Stable Diffusion so maybe I'm missing something here but I was trying to follow the HuggingFace tutorial https://huggingface.co/docs/diffusers/v0.20.0/en/api/pipelines/stable_diffusion/stable_diffusion_xl#1-ensemble-of-expert-denoisers
It was working before but now I'm trying to specify height and width...that seems to be when the problem started?
I've also tried adding requires_aesthetics_score=True to before sending refiner to cuda but that doesn't work -- same error.
ValueError Traceback (most recent call last)
Cell In[74], line 1
----> 1 refiner_image = refiner(
2 prompt="cartoon of colorful monsters frolocking in a dark spooky graveyard with tombstones and graves behind a castle",
3 num_inference_steps=n_steps,
4 denoising_end=high_noise_frac,
5 image=img
6 ).images[0]
File c:\Users\Mark\anaconda3\envs\auto_content_creator\lib\site-packages\torch\utils\_contextlib.py:115, in context_decorator..decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File c:\Users\Mark\anaconda3\envs\auto_content_creator\lib\site-packages\diffusers\pipelines\stable_diffusion_xl\pipeline_stable_diffusion_xl_img2img.py:910, in StableDiffusionXLImg2ImgPipeline.__call__(self, prompt, prompt_2, image, strength, num_inference_steps, denoising_start, denoising_end, guidance_scale, negative_prompt, negative_prompt_2, num_images_per_prompt, eta, generator, latents, prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, output_type, return_dict, callback, callback_steps, cross_attention_kwargs, guidance_rescale, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score)
908 # 8. Prepare added time ids & embeddings
909 add_text_embeds = pooled_prompt_embeds
--> 910 add_time_ids, add_neg_time_ids = self._get_add_time_ids(
911 original_size,
912 crops_coords_top_left,
913 target_size,
914 aesthetic_score,
915 negative_aesthetic_score,
916 dtype=prompt_embeds.dtype,
917 )
918 add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1)
920 if do_classifier_free_guidance:
File c:\Users\Mark\anaconda3\envs\auto_content_creator\lib\site-packages\diffusers\pipelines\stable_diffusion_xl\pipeline_stable_diffusion_xl_img2img.py:613, in StableDiffusionXLImg2ImgPipeline._get_add_time_ids(self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype)
607 expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
609 if (
610 expected_add_embed_dim > passed_add_embed_dim
611 and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim
612 ):
--> 613 raise ValueError(
614 f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model."
615 )
616 elif (
617 expected_add_embed_dim < passed_add_embed_dim
618 and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim
619 ):
620 raise ValueError(
621 f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model."
622 )
ValueError: Model expects an added time embedding vector of length 2816, but a vector of 2560 was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` 6.0 and `negative_aesthetic_score` 2.5 is correctly used by the model.
My code is:
from diffusers import StableDiffusionXLPipeline, DiffusionPipeline
import torch
import os
base = StableDiffusionXLPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
)
base.to("cuda")
refiner = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-refiner-1.0",
**base.components
)
refiner.to("cuda")
refiner.register_to_config(requires_aesthetics_score=True)
n_steps = 40
high_noise_frac = 0.8
def text_to_image(prompt):
base_image = base(
prompt=prompt,
num_inference_steps=n_steps,
denoising_end=high_noise_frac,
output_type="latent",
height=640,
width=1536
).images
refiner_image = refiner(
prompt=prompt,
num_inference_steps=n_steps,
denoising_end=high_noise_frac,
image=base_image
).images[0]
return refiner_image
img = text_to_image("cartoon of colorful monsters frolocking in a dark spooky graveyard with tombstones and graves behind a castle")