Qwen-Image-Edit-2509 Lineart Interpolation
This is a LoRA weight for lineart interpolation: given a start frame and an end frame, it generates the in-between (middle) frame. It was trained for 3,000 steps on a randomly selected 10% of the train subset of the Mixamo 240 dataset.
⚠️ Notes
- Still at an experimental stage.
- Verification is still limited, but it works in some examples.
Quick start
import os
import math
import torch
import numpy as np
from PIL import Image
from diffusers import (
    QwenImageEditPipeline,
    FlowMatchEulerDiscreteScheduler,
)
from diffusers.utils.torch_utils import randn_tensor
# --------- Adjust these settings to your environment ---------
BASE_MODEL_ID = "Qwen/Qwen-Image-Edit-2509"  # same base model as used for training
LORA_DIR = "<PATH_TO_WEIGHT>"
DEVICE = "cuda"
DTYPE = torch.bfloat16

CONTROL1_IMAGE_PATH = "<PATH_TO_START_IMAGE>"
CONTROL2_IMAGE_PATH = "<PATH_TO_END_IMAGE>"

PROMPT = "<inbetween> middle frame"  # Don't change
NEGATIVE_PROMPT = " "

NUM_STEPS = 30
SEED = 0
# ===================
def calculate_dimensions(target_area, ratio):
    width = math.sqrt(target_area * ratio)
    height = width / ratio
    width = round(width / 32) * 32
    height = round(height / 32) * 32
    return int(width), int(height), None
def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    mu = image_seq_len * m + b
    return mu
def retrieve_timesteps(
    scheduler,
    num_inference_steps: int = None,
    device: torch.device | str | None = None,
    timesteps=None,
    sigmas=None,
    **kwargs,
):
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed.")
    if timesteps is not None:
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps
def main():
    torch.manual_seed(SEED)

    # ---------- 1. Load the base pipeline + LoRA ----------
    pipe: QwenImageEditPipeline = QwenImageEditPipeline.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
    ).to(DEVICE)
    pipe.load_lora_weights(LORA_DIR, adapter_name="my_lora")
    pipe.set_adapters(["my_lora"], adapter_weights=[1.0])

    transformer = pipe.transformer.to(DEVICE, dtype=DTYPE)
    vae = pipe.vae.to(DEVICE, dtype=DTYPE)
    scheduler: FlowMatchEulerDiscreteScheduler = pipe.scheduler
    vae_scale_factor = pipe.vae_scale_factor
    # ------------------------------------------------------------
    # ---------- 2. Load & preprocess the control images ----------
    control1_img = Image.open(CONTROL1_IMAGE_PATH).convert("RGB")
    control2_img = Image.open(CONTROL2_IMAGE_PATH).convert("RGB")

    # Same resize logic as the official Qwen pipeline
    image_size = control1_img.size  # (W, H)
    calculated_width, calculated_height, _ = calculate_dimensions(
        1024 * 1024, image_size[0] / image_size[1]
    )

    # Align to multiple_of for the VAE + patch packing
    multiple_of = vae_scale_factor * 2
    width = calculated_width // multiple_of * multiple_of
    height = calculated_height // multiple_of * multiple_of

    # Same image_processor usage as the official pipeline
    control1_resized = pipe.image_processor.resize(control1_img, calculated_height, calculated_width)
    control2_resized = pipe.image_processor.resize(control2_img, calculated_height, calculated_width)

    control1_px = pipe.image_processor.preprocess(
        control1_resized, calculated_height, calculated_width
    ).unsqueeze(2)  # [B, 3, 1, H, W]
    control2_px = pipe.image_processor.preprocess(
        control2_resized, calculated_height, calculated_width
    ).unsqueeze(2)
    control1_px = control1_px.to(DEVICE, DTYPE)
    control2_px = control2_px.to(DEVICE, DTYPE)

    bsz = control1_px.shape[0]
    num_channels_latents = transformer.config.in_channels // 4  # = vae.config.z_dim
    # ---------- 3. Encode the control images into latent tokens ----------
    with torch.no_grad():
        # Equivalent to the official _encode_vae_image
        ctrl1_latents_5d = pipe._encode_vae_image(control1_px, generator=None)  # [B, C, 1, H', W']
        ctrl2_latents_5d = pipe._encode_vae_image(control2_px, generator=None)
    H_lat, W_lat = ctrl1_latents_5d.shape[3], ctrl1_latents_5d.shape[4]

    # Pack into the token layout expected by the transformer
    ctrl1_tokens = QwenImageEditPipeline._pack_latents(
        ctrl1_latents_5d, bsz, num_channels_latents, H_lat, W_lat
    )  # [B, N, C*4]
    ctrl2_tokens = QwenImageEditPipeline._pack_latents(
        ctrl2_latents_5d, bsz, num_channels_latents, H_lat, W_lat
    )
    # ---------- 4. Randomly initialize the target latent ----------
    # Same shape as the official prepare_latents
    height_lat = 2 * (height // (vae_scale_factor * 2))
    width_lat = 2 * (width // (vae_scale_factor * 2))
    shape = (bsz, 1, num_channels_latents, height_lat, width_lat)
    latents_5d = randn_tensor(shape, device=DEVICE, dtype=DTYPE)
    latents = QwenImageEditPipeline._pack_latents(
        latents_5d, bsz, num_channels_latents, height_lat, width_lat
    )  # from here on, everything stays in packed [B, N, C*4] form
    # ---------- 5. Text embeddings ----------
    with torch.no_grad():
        prompt_embeds, prompt_embeds_mask = pipe.encode_prompt(
            image=control1_resized,  # as in the official pipeline: the resized image
            prompt=[PROMPT],
            device=DEVICE,
            num_images_per_prompt=1,
            max_sequence_length=1024,
        )
    txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist()

    # img_shapes for the three streams (used for rotary embeddings)
    img_shapes = [[
        (1, height_lat // 2, width_lat // 2),  # target
        (1, height_lat // 2, width_lat // 2),  # control1
        (1, height_lat // 2, width_lat // 2),  # control2
    ]] * bsz

    # If your implementation precomputes the rotary embeddings:
    # image_rotary_emb = transformer.pos_embed(img_shapes, txt_seq_lens, device=DEVICE)
    # ---------- 6. Prepare scheduler timesteps (matches the official pipeline) ----------
    sigmas = np.linspace(1.0, 1.0 / NUM_STEPS, NUM_STEPS)
    image_seq_len = latents.shape[1]  # number of tokens
    mu = calculate_shift(
        image_seq_len,
        scheduler.config.get("base_image_seq_len", 256),
        scheduler.config.get("max_image_seq_len", 4096),
        scheduler.config.get("base_shift", 0.5),
        scheduler.config.get("max_shift", 1.15),
    )
    timesteps, _ = retrieve_timesteps(
        scheduler,
        NUM_STEPS,
        device=DEVICE,
        sigmas=sigmas,
        mu=mu,
    )
    scheduler.set_begin_index(0)

    # No guidance (multi-control only)
    guidance = None
    # ---------- 7. Denoising loop ----------
    with torch.no_grad():
        for i, t in enumerate(timesteps):
            # Transformer input: concatenate target + control1 + control2 along the token axis
            latent_model_input = torch.cat([latents, ctrl1_tokens, ctrl2_tokens], dim=1)

            # Broadcast the timestep over the batch
            timestep_batch = t.expand(latents.shape[0]).to(latents.dtype)

            model_pred_all = transformer(
                hidden_states=latent_model_input,
                timestep=timestep_batch / 1000.0,  # same scale as during training
                guidance=guidance,
                encoder_hidden_states=prompt_embeds,
                encoder_hidden_states_mask=prompt_embeds_mask,
                img_shapes=img_shapes,  # works with this diffusers version
                txt_seq_lens=txt_seq_lens,
                # With a newer API, pass instead:
                # image_rotary_emb=image_rotary_emb,
                # attention_kwargs=None,
                return_dict=False,
            )[0]  # [B, N_total, C*4]

            # Keep only the leading target tokens
            model_pred = model_pred_all[:, : latents.size(1)]

            latents_dtype = latents.dtype
            latents = scheduler.step(model_pred, t, latents, return_dict=False)[0]
            if latents.dtype != latents_dtype:
                latents = latents.to(latents_dtype)
    # ---------- 8. Decode ----------
    # packed tokens -> 5D latent
    latents = QwenImageEditPipeline._unpack_latents(
        latents,
        height=height,
        width=width,
        vae_scale_factor=vae_scale_factor,
    )  # [B, C, 1, H_lat, W_lat]
    latents = latents.to(vae.dtype)

    # Undo the latents_mean / latents_std normalization (same as the official pipeline)
    latents_mean = (
        torch.tensor(vae.config.latents_mean)
        .view(1, vae.config.z_dim, 1, 1, 1)
        .to(latents.device, latents.dtype)
    )
    latents_std = 1.0 / torch.tensor(vae.config.latents_std).view(
        1, vae.config.z_dim, 1, 1, 1
    ).to(latents.device, latents.dtype)
    latents = latents / latents_std + latents_mean  # [B, C, 1, H_lat, W_lat]

    # Decode (use only frame 0 of the T=1 axis)
    with torch.no_grad():
        image_latents = latents
        decoded = vae.decode(image_latents, return_dict=False)[0]  # [B, 3, 1, H, W]
    images = decoded[:, :, 0, :, :]  # [B, 3, H, W]

    # Qwen image_processor: [-1, 1] -> PIL
    images = pipe.image_processor.postprocess(images, output_type="pil")
    out: Image.Image = images[0]
    out.save("multi_control_from_controls.png")
    print("saved to multi_control_from_controls.png")
if __name__ == "__main__":
    main()
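For a quick visual sanity check of the output, you can stitch the start frame, the generated in-between, and the end frame into a looping GIF with Pillow. This is only a convenience sketch, not part of the original script; the output name interpolation_preview.gif and the resizing step are assumptions.

from PIL import Image

# Preview: start key frame -> generated in-between -> end key frame.
# Paths follow the Quick start above; replace the placeholders with real files.
frames = [
    Image.open("<PATH_TO_START_IMAGE>").convert("RGB"),
    Image.open("multi_control_from_controls.png").convert("RGB"),
    Image.open("<PATH_TO_END_IMAGE>").convert("RGB"),
]

# Resize the key frames to the generated frame's size so the GIF does not jitter.
size = frames[1].size
frames = [f.resize(size) for f in frames]

frames[0].save(
    "interpolation_preview.gif",
    save_all=True,
    append_images=frames[1:],
    duration=300,  # milliseconds per frame
    loop=0,
)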
Acknowledgement
- @lisiyao21 AnimeInbet
- @Qwen Qwen-Image-Edit-2509