Videos
import streamlit as st
import torch
from diffusers import StableVideoDiffusionPipeline, StableDiffusionPipeline
from moviepy.editor import VideoFileClip, AudioFileClip
import requests
import os
import subprocess
import cv2 # For photo handling
from PIL import Image
import tempfile
# SadTalker inference helper (adjust the path to your SadTalker checkout)
def run_sadtalker(source_image_path, driven_audio_path, output_dir="results"):
    # Run the SadTalker CLI (assumes the repo is cloned in the current dir; adjust the path otherwise)
    cmd = [
        "python", "SadTalker/inference.py",
        "--driven_audio", driven_audio_path,
        "--source_image", source_image_path,
        "--result_dir", output_dir,
        "--still",               # Optional: minimize head movement for preservation
        "--preprocess", "crop",  # Crop to the face for better sync
        "--enhancer", "gfpgan",  # Preserve/enhance face details
    ]
    subprocess.run(cmd, check=True)
    # The output video is written into output_dir (e.g. "result.mp4")
    for file in os.listdir(output_dir):
        if file.endswith(".mp4"):
            return os.path.join(output_dir, file)
    raise ValueError("No output video generated.")
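
# The helper above assumes a local SadTalker checkout with its checkpoints
# downloaded. A minimal, optional pre-flight check (hypothetical helper, not
# part of the original flow) gives a readable error instead of a raw
# subprocess failure:
def check_sadtalker_install(repo_dir="SadTalker"):
    # Verify the cloned repo's inference script exists before shelling out.
    if not os.path.isfile(os.path.join(repo_dir, "inference.py")):
        raise FileNotFoundError(
            f"SadTalker not found at '{repo_dir}'. Clone the repo and download "
            "its checkpoints before using photo-to-talking-video mode."
        )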
# ElevenLabs TTS (same as before)
ELEVENLABS_API_KEY = st.secrets.get("ELEVENLABS_API_KEY", "your_key_here")
def generate_audio(text, voice_id="21m00Tcm4TlvDq8ikWAM"):
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    headers = {"xi-api-key": ELEVENLABS_API_KEY}
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.5},
    }
    response = requests.post(url, json=data, headers=headers)
    if response.status_code == 200:
        # ElevenLabs returns MP3 data by default, so name the file accordingly
        audio_path = "temp_audio.mp3"
        with open(audio_path, "wb") as f:
            f.write(response.content)
        return audio_path
    else:
        st.error("Audio generation failed. Using gTTS fallback.")
        from gtts import gTTS  # pip install gtts
        tts = gTTS(text)
        audio_path = "temp_audio.mp3"  # gTTS also writes MP3 data
        tts.save(audio_path)
        return audio_path
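
# The ElevenLabs key above is read from Streamlit secrets (.streamlit/secrets.toml),
# with "your_key_here" as a placeholder. An optional guard (hypothetical, not part
# of the original flow) to surface a missing key before the first API call:
def elevenlabs_configured():
    # True only when a real key has replaced the placeholder.
    return bool(ELEVENLABS_API_KEY) and ELEVENLABS_API_KEY != "your_key_here"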
# Original video generation (for non-photo mode)
@st.cache_resource
def load_video_models():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32  # fp16 only on GPU
    video_pipe = StableVideoDiffusionPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=dtype
    ).to(device)
    img_pipe = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4", torch_dtype=dtype
    ).to(device)
    return video_pipe, img_pipe, device
def generate_text_to_video(prompt, num_frames=49, height=576, width=1024):
    # Note: SVD-XT was trained for 25-frame clips; higher frame counts may degrade quality.
    video_pipe, img_pipe, device = load_video_models()
    init_image = img_pipe(prompt).images[0]
    init_image.save("temp_image.png")
    generator = torch.manual_seed(42)
    frames = video_pipe(
        init_image, num_frames=num_frames, height=height, width=width, generator=generator
    ).frames[0]
    from diffusers.utils import export_to_video
    video_path = export_to_video(frames, "temp_video.mp4")
    return video_path
# Wav2Lip for fallback sync (if not using SadTalker)
def lip_sync_video(video_path, audio_path):
    output_path = "synced_video.mp4"
    checkpoint_path = "checkpoints/wav2lip_gan.pth"  # Download the checkpoint if needed
    subprocess.run([
        "python", "Wav2Lip/inference.py",
        "--checkpoint_path", checkpoint_path,
        "--face", video_path,
        "--audio", audio_path,
        "--outfile", output_path,
        "--resize_factor", "1",  # Preserve resolution
    ], check=True)
    return output_path
def upscale_to_hd(input_video):
    output_hd = "final_hd_video.mp4"
    subprocess.run([
        "ffmpeg", "-y", "-i", input_video, "-vf", "scale=1920:1080:flags=lanczos",
        "-c:v", "libx264", "-crf", "18", "-preset", "slow",  # High quality
        "-c:a", "copy",  # Keep the embedded speech track untouched
        output_hd,
    ], check=True)
    return output_hd
def create_static_video_from_photo(photo_path, duration=5, fps=25):
    # Create a static video from the photo (for the Wav2Lip fallback)
    img = cv2.imread(photo_path)
    height, width = img.shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter("static_video.mp4", fourcc, fps, (width, height))
    for _ in range(int(duration * fps)):
        out.write(img)
    out.release()
    return "static_video.mp4"
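
# create_static_video_from_photo() is defined for a Wav2Lip fallback but is not
# wired into the UI below (uploaded photos go straight to SadTalker). A sketch of
# that fallback path, assuming Wav2Lip and its checkpoint are installed and the
# photo shows a clearly visible face:
def photo_to_talking_video_wav2lip(photo_path, audio_path, duration=5):
    # Turn the still photo into a short static clip, then lip-sync it to the audio.
    static_clip = create_static_video_from_photo(photo_path, duration=duration)
    return lip_sync_video(static_clip, audio_path)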
# Streamlit UI
st.title("AI Video Generator: Text-to-Video or Photo-to-Talking-Video")
script = st.text_area("Enter your script/dialogue:", height=100)
video_prompt = st.text_input("Video style prompt (for text-to-video mode):")
uploaded_photo = st.file_uploader("Upload a photo (JPG/PNG) for animation (optional):", type=["jpg", "png"])

if st.button("Generate Video"):
    if not script:
        st.warning("Please provide a script.")
    else:
        final_path = None
        with st.spinner("Generating... This may take 5-30 minutes depending on mode."):
            audio_path = generate_audio(script)
            if uploaded_photo is not None:
                # Photo-to-Video Mode: use SadTalker for identity preservation and sync
                st.info("Using photo-to-talking-video mode. Preserving character identity.")
                # Save the uploaded photo to a temp file
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                    img = Image.open(uploaded_photo)
                    img.save(tmp.name)
                    photo_path = tmp.name
                # Generate with SadTalker (handles animation + lip sync)
                output_dir = tempfile.mkdtemp()
                synced_path = run_sadtalker(photo_path, audio_path, output_dir)
                # Upscale to HD
                final_path = upscale_to_hd(synced_path)
                # Clean up
                os.unlink(photo_path)
                os.remove(audio_path)  # SadTalker embeds the audio in its output
            else:
                # Text-to-Video Mode (original)
                if not video_prompt:
                    st.warning("Provide a video prompt for text-to-video.")
                else:
                    st.info("Using text-to-video mode.")
                    video_path = generate_text_to_video(video_prompt)
                    # Lip-sync the generated clip to the TTS audio (assumes the clip shows a face;
                    # otherwise create_static_video_from_photo offers a static-frame fallback)
                    synced_path = lip_sync_video(video_path, audio_path)
                    final_path = upscale_to_hd(synced_path)
                    os.remove(audio_path)
        # No extra merge step needed: SadTalker/Wav2Lip already embed the audio
        if final_path and os.path.exists(final_path):
            st.video(final_path)
            with open(final_path, "rb") as f:
                st.download_button("Download HD Video", f, file_name="generated_video.mp4")
        else:
            st.error("Generation failed. Check logs.")
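
# moviepy's VideoFileClip/AudioFileClip are imported above but unused: SadTalker
# and Wav2Lip normally embed the speech track themselves. If a final file ever
# comes out silent, a remux helper along these lines (hypothetical, untested
# sketch) could re-attach the TTS audio:
def mux_audio(video_path, audio_path, output_path="final_with_audio.mp4"):
    video = VideoFileClip(video_path)
    audio = AudioFileClip(audio_path)
    video.set_audio(audio).write_videofile(output_path, codec="libx264", audio_codec="aac")
    return output_path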
