Spaces:

AswinMathew
/

anime-gen-api

Sleeping

App Files Files Community

anime-gen-api / test_cut01_pipeline.py

AswinMathew

Upload folder using huggingface_hub

7190fd0 verified 3 months ago

raw

history blame contribute delete

10.1 kB

	"""End-to-end pipeline test for a single cut (C01).

	Tests the full flow: storyboard → image prompt assembly → image gen →
	TTS → video gen (img2vid) → audio mux → final clip with voice.

	Cost: ~0.006 pollen (1 image + 1 video clip)
	"""
	import asyncio
	import json
	import math
	import sys
	import time
	from pathlib import Path

	sys.path.insert(0, ".")

	from app.utils.prompt_builder import build_image_prompt, _build_identity_anchor
	from app.services.pollinations import generate_image, generate_video, upload_media
	from app.services.tts import generate_tts
	from app.services.ffmpeg import mux_audio, get_duration

	# Directory (relative to the current working directory) that receives every
	# artifact this script writes: keyframe, voice track, silent clip, final clip.
	OUTPUT_DIR = Path("test_cut01_output")

	# Character name -> visual prompt text (kept the same as test_storyboard.py).
	# main() passes this mapping to build_image_prompt() and feeds individual
	# entries to _build_identity_anchor().
	CHARACTERS = {
	    "Ye Chen": (
	        "young adult male, jet black short messy hair with side-swept bangs, "
	        "deep amber eyes, fair skin, athletic build, "
	        "wearing white inner sect disciple robes with silver trim, "
	        "determined intense expression"
	    ),
	    "Gu Changge": (
	        "young adult male, long flowing silver-white hair, "
	        "cold piercing violet eyes, pale ivory skin, tall elegant build, "
	        "wearing luxurious dark purple and gold noble robes with intricate embroidery, "
	        "calm indifferent expression"
	    ),
	    "Taixuan Holy Lord": (
	        "middle-aged male, dark brown hair tied in a topknot, "
	        "golden glowing eyes, warm bronze skin, imposing muscular build, "
	        "wearing ornate golden and white holy lord ceremonial robes with dragon motifs, "
	        "stern authoritative expression"
	    ),
	    "Holy Maiden": (
	        "young adult female, long flowing black hair with jade hairpin, "
	        "autumn water-like gentle brown eyes, porcelain skin, slim graceful build, "
	        "wearing elegant green silk dress with fluttering sleeves, "
	        "ethereal calm expression like a fairy"
	    ),
	}

	# Per-speaker TTS voice settings (voice name, speaking rate, pitch offset).
	# main() falls back to the "Narrator" entry for any speaker not listed here.
	VOICE_CONFIGS = {
	    "Ye Chen": {
	        "voice_name": "en-US-AndrewNeural",
	        "rate": "+5%",
	        "pitch": "+2Hz",
	    },
	    "Narrator": {
	        "voice_name": "en-US-GuyNeural",
	        "rate": "-5%",
	        "pitch": "-5Hz",
	    },
	    "Taixuan Holy Lord": {
	        "voice_name": "en-US-RogerNeural",
	        "rate": "-10%",
	        "pitch": "-10Hz",
	    },
	}


	def _find_cut(storyboard, cut_id):
	    """Return the first cut in *storyboard* whose ``cut_id`` matches, else ``None``.

	    Each scene's cuts are read from its ``"cuts"`` key, or from ``"shots"``
	    when ``"cuts"`` is absent. A storyboard without ``"scenes"`` yields ``None``.
	    """
	    for scene in storyboard.get("scenes", []):
	        for cut in scene.get("cuts", scene.get("shots", [])):
	            if cut.get("cut_id") == cut_id:
	                return cut
	    return None


	async def main():
	    """Run the C01 pipeline end to end, printing progress for every step.

	    Steps: load ``test_storyboard_output.json``, assemble the image prompt,
	    generate the keyframe (trying several image models in order), synthesize
	    the cut's dialogue with TTS, generate an img2vid clip from the uploaded
	    keyframe, and mux the voice track into the clip.

	    The keyframe, voice track and silent clip are reused when they already
	    exist in ``OUTPUT_DIR``; the final mux is always redone.

	    Returns ``None``. When the storyboard file is missing, C01 is not in it,
	    C01 has no dialogue text, or every image model fails, a message is
	    printed and the function returns early instead of raising.
	    """
	    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

	    banner = "=" * 70
	    rule = "-" * 70

	    def section(title):
	        # Header shared by steps 2-6: blank line, rule, title, rule.
	        print(f"\n{rule}")
	        print(title)
	        print(rule)

	    # ═══ Step 1: Load storyboard ═══
	    print(banner)
	    print("CUT 01 END-TO-END PIPELINE TEST")
	    print(banner)

	    storyboard_path = Path("test_storyboard_output.json")
	    if not storyboard_path.exists():
	        print("ERROR: Run test_storyboard.py first to generate the storyboard")
	        return

	    with open(storyboard_path, "r", encoding="utf-8") as f:
	        storyboard = json.load(f)

	    cut01 = _find_cut(storyboard, "C01")
	    if cut01 is None:
	        print("ERROR: C01 not found in storyboard")
	        return

	    # Everything from step 4 onward needs a spoken line. Stop here with a
	    # readable message rather than a TypeError/KeyError further down when the
	    # cut has no dialogue (missing key, null value, or empty text).
	    dialogue = cut01.get("dialogue") or {}
	    if not dialogue.get("text"):
	        print("ERROR: C01 has no dialogue text; this test needs a spoken line for TTS")
	        return
	    speaker = dialogue.get("speaker")

	    print(f"\nCut: {cut01['cut_id']}")
	    print(f"Shot type: {cut01['shot_type']}")
	    print(f"Camera: {cut01['camera_movement']}")
	    print(f"Duration: {cut01['duration_sec']}s")
	    print(f"Characters: {cut01['characters_present']}")
	    print(f"Focal: {cut01['focal_character']}")
	    print(f"Speaker: {speaker}")
	    print(f"Dialogue: {dialogue['text'][:80]}...")

	    # ═══ Step 2: Assemble image prompt ═══
	    section("STEP 2: Image Prompt Assembly")

	    image_prompt = build_image_prompt(cut01, CHARACTERS)
	    print(f"\nAssembled prompt ({len(image_prompt)} chars):")
	    print(f"\n{image_prompt}")

	    # Show identity anchor for reference (only for characters we have a
	    # visual prompt for).
	    for char_name in cut01["characters_present"]:
	        if char_name in CHARACTERS:
	            anchor = _build_identity_anchor(CHARACTERS[char_name])
	            print(f"\nIdentity anchor for {char_name}: {anchor}")

	    # ═══ Step 3: Generate image ═══
	    section("STEP 3: Image Generation (imagen-4)")

	    image_path = str(OUTPUT_DIR / "C01_keyframe.png")

	    # Model priority: try imagen-4, fallback to grok-imagine, then flux
	    image_models = ["imagen-4", "grok-imagine", "flux"]

	    if Path(image_path).exists():
	        print(f" CACHED: {image_path}")
	    else:
	        for attempt, model in enumerate(image_models):
	            if attempt:
	                # A previous model failed: announce the fallback and pause
	                # briefly before the next request.
	                print(" Trying next model...")
	                await asyncio.sleep(3)

	            t0 = time.time()
	            print(f" Trying {model}...")
	            try:
	                await generate_image(
	                    prompt=image_prompt,
	                    output_path=image_path,
	                    model=model,
	                    width=1024,
	                    height=768,
	                )
	            except Exception as e:
	                # Deliberately broad: any failure of one model should fall
	                # through to the next model in the list.
	                print(f" FAILED ({model}): {e}")
	            else:
	                elapsed = time.time() - t0
	                print(f" OK — {model}, saved to {image_path} ({elapsed:.1f}s)")
	                break
	        else:
	            # Loop ended without a successful `break`.
	            print(" All models failed!")
	            return

	    # ═══ Step 4: Generate TTS ═══
	    section("STEP 4: TTS Generation (Edge TTS)")

	    audio_path = str(OUTPUT_DIR / "C01_voice.mp3")
	    # Speakers without a dedicated voice use the narrator's settings.
	    voice_config = VOICE_CONFIGS.get(speaker, VOICE_CONFIGS["Narrator"])

	    if Path(audio_path).exists():
	        print(f" CACHED: {audio_path}")
	        tts_duration = await get_duration(audio_path)
	    else:
	        t0 = time.time()
	        print(f" Speaker: {speaker}")
	        print(f" Voice: {voice_config['voice_name']}")
	        print(f" Emotion: {dialogue.get('emotion', 'neutral')}")
	        print(f" Text: {dialogue['text'][:100]}...")

	        tts_result = await generate_tts(
	            text=dialogue["text"],
	            output_path=audio_path,
	            voice_name=voice_config["voice_name"],
	            rate=voice_config.get("rate", "+0%"),
	            pitch=voice_config.get("pitch", "+0Hz"),
	            emotion=dialogue.get("emotion", "neutral"),
	        )
	        tts_duration = tts_result["duration_sec"]
	        elapsed = time.time() - t0
	        print(f" OK — {tts_duration:.2f}s audio ({elapsed:.1f}s)")
	        print(f" Word timestamps: {len(tts_result.get('word_timestamps', []))} words")

	    print(f"\n TTS duration: {tts_duration:.2f}s")
	    print(f" Storyboard duration: {cut01['duration_sec']}s")

	    # ═══ Step 5: Generate video (img2vid) ═══
	    section("STEP 5: Video Generation (grok-video img2vid)")

	    silent_video_path = str(OUTPUT_DIR / "C01_silent.mp4")
	    # Whole seconds covering the voice line, clamped to the range 1..10.
	    video_duration = max(1, min(math.ceil(tts_duration), 10))

	    # Build video prompt from whichever of the two optional fields are set.
	    video_prompt_parts = [
	        cut01[key]
	        for key in ("video_prompt", "action_description")
	        if cut01.get(key)
	    ]
	    video_prompt = ", ".join(video_prompt_parts) if video_prompt_parts else "subtle idle animation"

	    print(f" Video prompt: {video_prompt[:120]}...")
	    print(f" Video duration: {video_duration}s (ceil of {tts_duration:.2f}s TTS)")

	    if Path(silent_video_path).exists():
	        print(f" CACHED (silent): {silent_video_path}")
	    else:
	        # Upload keyframe image; the timer covers upload + generation.
	        t0 = time.time()
	        print(" Uploading keyframe to media.pollinations.ai...")
	        image_url = await upload_media(image_path)
	        print(f" Image URL: {image_url[:80]}...")

	        print(" Generating video via grok-video (this takes 2-3 minutes)...")
	        await generate_video(
	            prompt=video_prompt,
	            output_path=silent_video_path,
	            model="grok-video",
	            duration=video_duration,
	            image_url=image_url,
	        )
	        elapsed = time.time() - t0
	        file_size = Path(silent_video_path).stat().st_size
	        print(f" OK — {file_size // 1024}KB silent video ({elapsed:.1f}s)")

	    # ═══ Step 6: Mux audio into video ═══
	    section("STEP 6: Audio Mux (FFmpeg)")

	    final_clip_path = str(OUTPUT_DIR / "C01_final.mp4")

	    t0 = time.time()
	    video_dur = await get_duration(silent_video_path)
	    audio_dur = await get_duration(audio_path)
	    print(f" Silent video: {video_dur:.2f}s")
	    print(f" TTS audio: {audio_dur:.2f}s")
	    print(f" Mismatch: {abs(video_dur - audio_dur):.2f}s")

	    # Informational only: mux_audio() is called the same way in every case.
	    if audio_dur > video_dur + 0.5:
	        print(f" Strategy: freeze last frame + pad {audio_dur - video_dur:.1f}s")
	    elif video_dur > audio_dur:
	        print(" Strategy: trim video to match audio (-shortest)")
	    else:
	        print(" Strategy: durations match, simple remux")

	    await mux_audio(
	        video_path=silent_video_path,
	        audio_path=audio_path,
	        output_path=final_clip_path,
	        duration_sec=tts_duration,
	    )
	    elapsed = time.time() - t0

	    final_dur = await get_duration(final_clip_path)
	    final_size = Path(final_clip_path).stat().st_size
	    print(f" OK — final clip: {final_dur:.2f}s, {final_size // 1024}KB ({elapsed:.1f}s)")

	    # ═══ Summary ═══
	    print(f"\n{banner}")
	    print("RESULTS")
	    print(banner)
	    print(f" Keyframe image: {image_path}")
	    print(f" TTS audio: {audio_path} ({audio_dur:.2f}s)")
	    print(f" Silent video: {silent_video_path} ({video_dur:.2f}s)")
	    print(f" Final clip: {final_clip_path} ({final_dur:.2f}s)")
	    # NOTE(review): the "~0.006" headline is hard-coded and is not recomputed
	    # from video_duration or from cache hits — confirm against current pricing.
	    print(f"\n Pollen cost: ~0.006 (0.0025 image + 0.003/s × {video_duration}s video)")
	    print(f"\n Open {OUTPUT_DIR.resolve()} to review the outputs!")
	    print(f" Play {final_clip_path} to see the final result with voice!")


	# Script entry point: run the async pipeline on a fresh event loop.
	if __name__ == "__main__":
	    asyncio.run(main())