RADIO FUSION MUSICAL

From San Fernando, Chile: digital radio without limits

Videos
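
The script below is a proof-of-concept Streamlit app with two generation modes: upload a photo and SadTalker animates it into a talking-head video synced to ElevenLabs-generated speech, or give a text prompt and Stable Diffusion plus Stable Video Diffusion produce a short clip that Wav2Lip then lip-syncs. In both modes, ffmpeg upscales the result to 1080p. Treat it as a sketch rather than a finished tool: it assumes local checkouts of SadTalker and Wav2Lip, an ElevenLabs API key, and a CUDA GPU.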

import os
import subprocess
import tempfile

import cv2  # used to build a static video from a photo
import requests
import streamlit as st
import torch
from diffusers import StableDiffusionPipeline, StableVideoDiffusionPipeline
from PIL import Image

# SadTalker inference (adjust the path to your SadTalker checkout)
def run_sadtalker(source_image_path, driven_audio_path, output_dir="results"):
    cmd = [
        "python", "SadTalker/inference.py",
        "--driven_audio", driven_audio_path,
        "--source_image", source_image_path,
        "--result_dir", output_dir,
        "--still",               # optional: minimize head movement
        "--preprocess", "crop",  # crop to the face for better sync
        "--enhancer", "gfpgan",  # preserve/enhance face details
    ]
    subprocess.run(cmd, check=True)
    # The result is written as an .mp4 inside output_dir
    for file in os.listdir(output_dir):
        if file.endswith(".mp4"):
            return os.path.join(output_dir, file)
    raise ValueError("No output video generated.")

# ElevenLabs text-to-speech, with a gTTS fallback
ELEVENLABS_API_KEY = st.secrets.get("ELEVENLABS_API_KEY", "your_key_here")

def generate_audio(text, voice_id="21m00Tcm4TlvDq8ikWAM"):
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    headers = {"xi-api-key": ELEVENLABS_API_KEY}
    data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.5},
    }
    response = requests.post(url, json=data, headers=headers)
    audio_path = "temp_audio.wav"
    if response.status_code == 200:
        with open(audio_path, "wb") as f:
            f.write(response.content)
        return audio_path
    st.error("Audio generation failed. Using gTTS fallback.")
    from gtts import gTTS  # pip install gtts
    gTTS(text).save(audio_path)
    return audio_path

# Models for the text-to-video path (cached across Streamlit reruns)
@st.cache_resource
def load_video_models():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    video_pipe = StableVideoDiffusionPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt",
        torch_dtype=torch.float16,
    ).to(device)
    img_pipe = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4"
    ).to(device)
    return video_pipe, img_pipe, device

def generate_text_to_video(prompt, num_frames=49, height=576, width=1024):
    video_pipe, img_pipe, device = load_video_models()
    # Generate a still image from the prompt, then animate it
    init_image = img_pipe(prompt).images[0]
    init_image.save("temp_image.png")
    generator = torch.manual_seed(42)
    frames = video_pipe(
        init_image, num_frames=num_frames, height=height, width=width,
        generator=generator,
    ).frames[0]
    from diffusers.utils import export_to_video
    return export_to_video(frames, "temp_video.mp4")

# Wav2Lip lip sync (used by the text-to-video path)
def lip_sync_video(video_path, audio_path):
    output_path = "synced_video.mp4"
    checkpoint_path = "checkpoints/wav2lip_gan.pth"  # download separately
    subprocess.run([
        "python", "Wav2Lip/inference.py",
        "--checkpoint_path", checkpoint_path,
        "--face", video_path,
        "--audio", audio_path,
        "--outfile", output_path,
        "--resize_factor", "1",  # preserve resolution
    ], check=True)
    return output_path

def upscale_to_hd(input_video):
    output_hd = "final_hd_video.mp4"
    subprocess.run([
        "ffmpeg", "-y", "-i", input_video,
        "-vf", "scale=1920:1080:flags=lanczos",
        "-c:v", "libx264", "-crf", "18", "-preset", "slow",  # high quality
        output_hd,
    ], check=True)
    return output_hd

def create_static_video_from_photo(photo_path, duration=5, fps=25):
    # Build a static video from a photo (fallback input for Wav2Lip)
    img = cv2.imread(photo_path)
    height, width = img.shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter("static_video.mp4", fourcc, fps, (width, height))
    for _ in range(int(duration * fps)):
        out.write(img)
    out.release()
    return "static_video.mp4"

# Streamlit UI
st.title("AI Video Generator: Text-to-Video or Photo-to-Talking-Video")
script = st.text_area("Enter your script/dialogue:", height=100)
video_prompt = st.text_input("Video style prompt (for text-to-video mode):")
uploaded_photo = st.file_uploader(
    "Upload a photo (JPG/PNG) for animation (optional):", type=["jpg", "png"]
)

if st.button("Generate Video"):
    if not script:
        st.warning("Please provide a script.")
    else:
        with st.spinner("Generating... This may take 5-30 minutes depending on mode."):
            audio_path = generate_audio(script)
            final_path = None  # guards the final check if neither mode runs
            if uploaded_photo is not None:
                # Photo mode: SadTalker animates the face and syncs the lips
                st.info("Using photo-to-talking-video mode. Preserving character identity.")
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                    Image.open(uploaded_photo).save(tmp.name)
                    photo_path = tmp.name
                output_dir = tempfile.mkdtemp()
                synced_path = run_sadtalker(photo_path, audio_path, output_dir)
                final_path = upscale_to_hd(synced_path)
                os.unlink(photo_path)
                os.remove(audio_path)  # SadTalker already embeds the audio
            else:
                # Text mode: generate a clip, then lip-sync it with Wav2Lip
                if not video_prompt:
                    st.warning("Provide a video prompt for text-to-video.")
                else:
                    st.info("Using text-to-video mode.")
                    video_path = generate_text_to_video(video_prompt)
                    # Assumes the generated clip contains a visible face
                    synced_path = lip_sync_video(video_path, audio_path)
                    final_path = upscale_to_hd(synced_path)
                    os.remove(audio_path)

            # No extra merge needed: SadTalker/Wav2Lip outputs include audio
            if final_path and os.path.exists(final_path):
                st.video(final_path)
                with open(final_path, "rb") as f:
                    st.download_button("Download HD Video", f, file_name="generated_video.mp4")
            else:
                st.error("Generation failed. Check logs.")
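
To try the script, save it as, say, app.py (the filename is illustrative) and launch it with streamlit run app.py. It expects the streamlit, torch, diffusers, requests, opencv-python, Pillow, and gtts packages installed; the SadTalker and Wav2Lip repositories cloned next to the script, with the wav2lip_gan.pth checkpoint under checkpoints/; ffmpeg on the PATH; and an ELEVENLABS_API_KEY entry in Streamlit's secrets (without one, audio generation falls back to gTTS). A CUDA GPU is effectively required, since the Stable Video Diffusion pipeline is loaded in float16.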
