Videos
import streamlit as st
import torch
from diffusers import StableVideoDiffusionPipeline, StableDiffusionPipeline
from moviepy.editor import VideoFileClip, AudioFileClip
import requests
import os
import subprocess
import cv2 # For photo handling
from PIL import Image
import tempfile
# SadTalker inference helper (adjust the path to your SadTalker checkout)
def run_sadtalker(source_image_path, driven_audio_path, output_dir="results"):
    # Run the SadTalker CLI (assumes the repo is cloned in the current dir; adjust the path otherwise)
    cmd = [
        "python", "SadTalker/inference.py",
        "--driven_audio", driven_audio_path,
        "--source_image", source_image_path,
        "--result_dir", output_dir,
        "--still",               # Optional: minimize head movement for preservation
        "--preprocess", "crop",  # Crop to the face for better sync
        "--enhancer", "gfpgan",  # Preserve/enhance face details
    ]
    subprocess.run(cmd, check=True)
    # The output video is written into output_dir (e.g. "result.mp4")
    for file in os.listdir(output_dir):
        if file.endswith(".mp4"):
            return os.path.join(output_dir, file)
    raise ValueError("No output video generated.")
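
# The helper above assumes a local SadTalker checkout with its checkpoints
# downloaded. A minimal, optional pre-flight check (hypothetical helper, not
# part of the original flow) gives a readable error instead of a raw
# subprocess failure:
def check_sadtalker_install(repo_dir="SadTalker"):
    # Verify the cloned repo's inference script exists before shelling out.
    if not os.path.isfile(os.path.join(repo_dir, "inference.py")):
        raise FileNotFoundError(
            f"SadTalker not found at '{repo_dir}'. Clone the repo and download "
            "its checkpoints before using photo-to-talking-video mode."
        )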
# ElevenLabs TTS (same as before)
ELEVENLABS_API_KEY = st.secrets.get("ELEVENLABS_API_KEY", "your_key_here")
def generate_audio(text, voice_id="21m00Tcm4TlvDq8ikWAM"):
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    headers = {"xi-api-key": ELEVENLABS_API_KEY}
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.5},
    }
    response = requests.post(url, json=data, headers=headers)
    if response.status_code == 200:
        # ElevenLabs returns MP3 data by default, so name the file accordingly
        audio_path = "temp_audio.mp3"
        with open(audio_path, "wb") as f:
            f.write(response.content)
        return audio_path
    else:
        st.error("Audio generation failed. Using gTTS fallback.")
        from gtts import gTTS  # pip install gtts
        tts = gTTS(text)
        audio_path = "temp_audio.mp3"  # gTTS also writes MP3 data
        tts.save(audio_path)
        return audio_path
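
# The ElevenLabs key above is read from Streamlit secrets (.streamlit/secrets.toml),
# with "your_key_here" as a placeholder. An optional guard (hypothetical, not part
# of the original flow) to surface a missing key before the first API call:
def elevenlabs_configured():
    # True only when a real key has replaced the placeholder.
    return bool(ELEVENLABS_API_KEY) and ELEVENLABS_API_KEY != "your_key_here"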
# Original video generation (for non-photo mode)
@st.cache_resource
def load_video_models():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32  # fp16 only on GPU
    video_pipe = StableVideoDiffusionPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=dtype
    ).to(device)
    img_pipe = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4", torch_dtype=dtype
    ).to(device)
    return video_pipe, img_pipe, device
def generate_text_to_video(prompt, num_frames=49, height=576, width=1024):
    # Note: SVD-XT was trained for 25-frame clips; higher frame counts may degrade quality.
    video_pipe, img_pipe, device = load_video_models()
    init_image = img_pipe(prompt).images[0]
    init_image.save("temp_image.png")
    generator = torch.manual_seed(42)
    frames = video_pipe(
        init_image, num_frames=num_frames, height=height, width=width, generator=generator
    ).frames[0]
    from diffusers.utils import export_to_video
    video_path = export_to_video(frames, "temp_video.mp4")
    return video_path
# Wav2Lip for fallback sync (if not using SadTalker)
def lip_sync_video(video_path, audio_path):
    output_path = "synced_video.mp4"
    checkpoint_path = "checkpoints/wav2lip_gan.pth"  # Download the checkpoint if needed
    subprocess.run([
        "python", "Wav2Lip/inference.py",
        "--checkpoint_path", checkpoint_path,
        "--face", video_path,
        "--audio", audio_path,
        "--outfile", output_path,
        "--resize_factor", "1",  # Preserve resolution
    ], check=True)
    return output_path
def upscale_to_hd(input_video):
    output_hd = "final_hd_video.mp4"
    subprocess.run([
        "ffmpeg", "-y", "-i", input_video, "-vf", "scale=1920:1080:flags=lanczos",
        "-c:v", "libx264", "-crf", "18", "-preset", "slow",  # High quality
        "-c:a", "copy",  # Keep the embedded speech track untouched
        output_hd,
    ], check=True)
    return output_hd
def create_static_video_from_photo(photo_path, duration=5, fps=25):
    # Create a static video from the photo (for the Wav2Lip fallback)
    img = cv2.imread(photo_path)
    height, width = img.shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter("static_video.mp4", fourcc, fps, (width, height))
    for _ in range(int(duration * fps)):
        out.write(img)
    out.release()
    return "static_video.mp4"
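
# create_static_video_from_photo() is defined for a Wav2Lip fallback but is not
# wired into the UI below (uploaded photos go straight to SadTalker). A sketch of
# that fallback path, assuming Wav2Lip and its checkpoint are installed and the
# photo shows a clearly visible face:
def photo_to_talking_video_wav2lip(photo_path, audio_path, duration=5):
    # Turn the still photo into a short static clip, then lip-sync it to the audio.
    static_clip = create_static_video_from_photo(photo_path, duration=duration)
    return lip_sync_video(static_clip, audio_path)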
# Streamlit UI
st.title("AI Video Generator: Text-to-Video or Photo-to-Talking-Video")
script = st.text_area("Enter your script/dialogue:", height=100)
video_prompt = st.text_input("Video style prompt (for text-to-video mode):")
uploaded_photo = st.file_uploader("Upload a photo (JPG/PNG) for animation (optional):", type=["jpg", "png"])

if st.button("Generate Video"):
    if not script:
        st.warning("Please provide a script.")
    else:
        final_path = None
        with st.spinner("Generating... This may take 5-30 minutes depending on mode."):
            audio_path = generate_audio(script)
            if uploaded_photo is not None:
                # Photo-to-Video Mode: use SadTalker for identity preservation and sync
                st.info("Using photo-to-talking-video mode. Preserving character identity.")
                # Save the uploaded photo to a temp file
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                    img = Image.open(uploaded_photo)
                    img.save(tmp.name)
                    photo_path = tmp.name
                # Generate with SadTalker (handles animation + lip sync)
                output_dir = tempfile.mkdtemp()
                synced_path = run_sadtalker(photo_path, audio_path, output_dir)
                # Upscale to HD
                final_path = upscale_to_hd(synced_path)
                # Clean up
                os.unlink(photo_path)
                os.remove(audio_path)  # SadTalker embeds the audio in its output
            else:
                # Text-to-Video Mode (original)
                if not video_prompt:
                    st.warning("Provide a video prompt for text-to-video.")
                else:
                    st.info("Using text-to-video mode.")
                    video_path = generate_text_to_video(video_prompt)
                    # Lip-sync the generated clip to the TTS audio (assumes the clip shows a face;
                    # otherwise create_static_video_from_photo offers a static-frame fallback)
                    synced_path = lip_sync_video(video_path, audio_path)
                    final_path = upscale_to_hd(synced_path)
                    os.remove(audio_path)
        # No extra merge step needed: SadTalker/Wav2Lip already embed the audio
        if final_path and os.path.exists(final_path):
            st.video(final_path)
            with open(final_path, "rb") as f:
                st.download_button("Download HD Video", f, file_name="generated_video.mp4")
        else:
            st.error("Generation failed. Check logs.")
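
# moviepy's VideoFileClip/AudioFileClip are imported above but unused: SadTalker
# and Wav2Lip normally embed the speech track themselves. If a final file ever
# comes out silent, a remux helper along these lines (hypothetical, untested
# sketch) could re-attach the TTS audio:
def mux_audio(video_path, audio_path, output_path="final_with_audio.mp4"):
    video = VideoFileClip(video_path)
    audio = AudioFileClip(audio_path)
    video.set_audio(audio).write_videofile(output_path, codec="libx264", audio_codec="aac")
    return output_path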
