import spaces
import torch
import os
import tempfile
import time
from contextlib import nullcontext
from typing import Any
import gradio as gr
import numpy as np
from diffusers import DiffusionPipeline
from gradio_litmodel3d import LitModel3D
from huggingface_hub import login
from PIL import Image
# Authenticate with Hugging Face using token from environment
# HF_TOKEN is automatically available in Hugging Face Spaces
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
# Login to Hugging Face - this stores the token for all HF Hub operations
login(token=hf_token)
# Also ensure it's set as environment variable for any libraries that check it directly
os.environ["HF_TOKEN"] = hf_token
print("Authenticated with Hugging Face")
else:
print("Warning: HF_TOKEN not found. Gated models may not be accessible.")
print("Please ensure HF_TOKEN is set in your Space's secrets.")
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available")
# Set environment variables for building texture_baker and uv_unwrapper
os.environ["USE_CUDA"] = "1"
os.environ["USE_NATIVE_ARCH"] = "0" # Disable native arch to avoid build issues
def build_texture_baker_and_uv_unwrapper():
# Set CUDA architecture list to avoid detection issues
# PyTorch's build system fails when it can't detect GPU architectures
# Setting TORCH_CUDA_ARCH_LIST explicitly prevents this error
if torch.cuda.is_available():
try:
# Try to get the actual compute capability
compute_cap = torch.cuda.get_device_capability(0)
cuda_arch = f"{compute_cap[0]}.{compute_cap[1]}"
os.environ["TORCH_CUDA_ARCH_LIST"] = cuda_arch
print(
f"Detected CUDA capability: {cuda_arch}, setting TORCH_CUDA_ARCH_LIST={cuda_arch}"
)
except Exception as e:
# Fallback to common architectures if detection fails
# Include multiple architectures to support various GPU models
fallback_archs = "7.0;7.5;8.0;8.6;8.9;9.0"
os.environ["TORCH_CUDA_ARCH_LIST"] = fallback_archs
print(
f"Could not detect CUDA capability: {e}, using fallback architectures: {fallback_archs}"
)
else:
# Should not happen since we check above, but just in case
print("Warning: CUDA not available but trying to build with CUDA support")
os.system(
"USE_CUDA=1 USE_NATIVE_ARCH=0 pip install -vv --no-build-isolation ./texture_baker ./uv_unwrapper"
)
build_texture_baker_and_uv_unwrapper()
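
# These imports depend on the texture_baker and uv_unwrapper native
# extensions compiled above, so they must come after the build step.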
import sf3d.utils as sf3d_utils
from sf3d.system import SF3D
# Set up environment
os.environ["GRADIO_TEMP_DIR"] = os.path.join(os.environ.get("TMPDIR", "/tmp"), "gradio")
# Constants for 3D generation
COND_WIDTH = 512
COND_HEIGHT = 512
COND_DISTANCE = 1.6
COND_FOVY_DEG = 40
BACKGROUND_COLOR = [0.5, 0.5, 0.5]
# Conditioning camera is fixed for every request: a canonical pose at
# COND_DISTANCE with pinhole intrinsics from COND_FOVY_DEG. Computed once.
c2w_cond = sf3d_utils.default_cond_c2w(COND_DISTANCE)
intrinsic, intrinsic_normed_cond = sf3d_utils.create_intrinsic_from_fov_deg(
COND_FOVY_DEG, COND_HEIGHT, COND_WIDTH
)
generated_files = []
# Initialize device and SF3D model (like official app)
device = sf3d_utils.get_device()
# SF3D model - initialized at startup like official app
# Token is automatically used after login() call above
print("Loading SF3D model...")
sf3d_model = SF3D.from_pretrained(
"stabilityai/stable-fast-3d",
config_name="config.yaml",
weight_name="model.safetensors",
)
sf3d_model.eval()
sf3d_model = sf3d_model.to(device)
print("SF3D model loaded!")
# SDXL pipeline - initialized at startup
print("Loading Stable Diffusion XL model...")
sd_pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
use_safetensors=True,
variant="fp16" if device == "cuda" else None,
)
if device == "cuda":
sd_pipeline = sd_pipeline.to(device)
# VAE needs to be in float32 for proper decoding (fixes black image issue)
sd_pipeline.vae.to(torch.float32)
    # Enable VAE slicing to reduce peak memory during decoding (optional)
    try:
        sd_pipeline.enable_vae_slicing()
    except Exception:
        pass
    # Enable memory-efficient attention if xformers is installed (optional)
    try:
        sd_pipeline.enable_xformers_memory_efficient_attention()
    except Exception:
        pass
elif device == "mps":
sd_pipeline = sd_pipeline.to(device)
sd_pipeline.vae.to(torch.float32)
else:
sd_pipeline.enable_model_cpu_offload()
sd_pipeline.vae.to(torch.float32)
print("SDXL model loaded!")
@spaces.GPU()
def generate_text_to_image(
prompt: str, negative_prompt: str = "", num_inference_steps: int = 30
):
"""Generate image from text prompt using SDXL."""
print(f"Generating image from prompt: {prompt}")
# Generate image
with torch.no_grad():
if device == "cuda":
# Ensure VAE is in float32
sd_pipeline.vae.to(torch.float32)
# Temporarily override VAE's forward to ensure float32 decoding
original_vae_decode = sd_pipeline.vae.decode
def vae_decode_wrapper(latents, *args, **kwargs):
# Ensure latents are in float32 for decoding
if latents.dtype != torch.float32:
latents = latents.to(torch.float32)
# Disable autocast for VAE decoding
                with torch.autocast(device_type="cuda", enabled=False):
return original_vae_decode(latents, *args, **kwargs)
sd_pipeline.vae.decode = vae_decode_wrapper
try:
result = sd_pipeline(
prompt=prompt,
negative_prompt=negative_prompt if negative_prompt else None,
num_inference_steps=num_inference_steps,
)
image = result.images[0]
finally:
# Restore original decode method
sd_pipeline.vae.decode = original_vae_decode
else:
result = sd_pipeline(
prompt=prompt,
negative_prompt=negative_prompt if negative_prompt else None,
num_inference_steps=num_inference_steps,
)
image = result.images[0]
return image
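

# Example usage (illustrative only; invokes the full SDXL pipeline):
#   image = generate_text_to_image("a cute robot character", "blurry", 30)
#   image.save("robot.png")  # returns a PIL.Image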
def create_batch(input_image: Image.Image) -> dict[str, Any]:
"""Create batch for SF3D model - matches official app structure."""
img_cond = (
torch.from_numpy(
np.asarray(input_image.resize((COND_WIDTH, COND_HEIGHT))).astype(np.float32)
/ 255.0
)
.float()
.clip(0, 1)
)
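    # The alpha channel is the foreground mask; composite the RGB channels
    # over the neutral gray background with a per-pixel lerp.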
mask_cond = img_cond[:, :, -1:]
rgb_cond = torch.lerp(
torch.tensor(BACKGROUND_COLOR)[None, None, :], img_cond[:, :, :3], mask_cond
)
batch_elem = {
"rgb_cond": rgb_cond,
"mask_cond": mask_cond,
"c2w_cond": c2w_cond.unsqueeze(0),
"intrinsic_cond": intrinsic.unsqueeze(0),
"intrinsic_normed_cond": intrinsic_normed_cond.unsqueeze(0),
}
# Add batch dim
batched = {k: v.unsqueeze(0) for k, v in batch_elem.items()}
return batched
def run_model(input_image, remesh_option, vertex_count, texture_size):
"""Run SF3D model - matches official app structure."""
start = time.time()
with torch.no_grad():
with (
torch.autocast(device_type=device, dtype=torch.bfloat16)
if "cuda" in device
else nullcontext()
):
model_batch = create_batch(input_image)
model_batch = {k: v.to(device) for k, v in model_batch.items()}
trimesh_mesh, _glob_dict = sf3d_model.generate_mesh(
model_batch, texture_size, remesh_option.lower(), vertex_count
)
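            # generate_mesh returns one mesh per batch element; batch size is 1 here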
trimesh_mesh = trimesh_mesh[0]
# Create new tmp file in Gradio temp directory for proper serving
os.makedirs(os.environ["GRADIO_TEMP_DIR"], exist_ok=True)
tmp_file = tempfile.NamedTemporaryFile(
delete=False, suffix=".glb", dir=os.environ["GRADIO_TEMP_DIR"]
)
trimesh_mesh.export(tmp_file.name, file_type="glb", include_normals=True)
generated_files.append(tmp_file.name)
print("Generation took:", time.time() - start, "s")
print(f"GLB file saved to: {tmp_file.name}")
return tmp_file.name
@spaces.GPU()
def generate_3d_from_image(
input_image: Image.Image,
remesh_option: str = "none",
vertex_count: int = -1,
texture_size: int = 1024,
) -> str:
"""Generate 3D mesh from image using SF3D with built-in background removal."""
    # Convert to RGB if needed (SDXL output is already RGB, but uploaded
    # images may be RGBA or grayscale)
if input_image.mode != "RGB":
input_image = input_image.convert("RGB")
# Use SF3D's built-in background removal
# This handles the conversion to RGBA and background removal
print("Removing background using SF3D's built-in function...")
image_with_bg_removed = sf3d_utils.remove_background(input_image)
    # Rescale the foreground to fill ~85% of the conditioning frame (same
    # ratio as the official app)
    foreground_ratio = 0.85
processed_image = sf3d_utils.resize_foreground(
image_with_bg_removed, foreground_ratio, out_size=(COND_WIDTH, COND_HEIGHT)
)
return run_model(processed_image, remesh_option, vertex_count, texture_size)
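

# Example usage (illustrative; expects a PIL.Image, returns a GLB file path):
#   glb_path = generate_3d_from_image(Image.open("robot.png"), "none", -1, 1024)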
# Gradio Interface Functions
def step1_generate_image(prompt, negative_prompt, num_steps):
"""Step 1: Generate image from text."""
if not prompt:
return None, None
try:
image = generate_text_to_image(prompt, negative_prompt, num_steps)
return (
image,
image, # Auto-fill Step 2 image input
)
    except Exception as e:
        print(f"Image generation failed: {e}")
        return None, None
def step2_generate_3d(image, remesh_option, vertex_count, texture_size):
"""Step 2: Generate 3D model from image (with built-in background removal)."""
if image is None:
return (
None,
None,
)
try:
glb_file = generate_3d_from_image(
image, remesh_option, vertex_count, texture_size
)
return (
glb_file, # Direct file path for LitModel3D
glb_file, # Also return for file download component
)
    except Exception as e:
        print(f"3D generation failed: {e}")
        return (
            None,
            None,
        )
# Create Gradio Interface
custom_css = """
.container {
max-width: 50%;
margin: 0 auto;
}
.container textarea[data-testid*="textbox"],
.container input[type="text"] {
width: 100% !important;
box-sizing: border-box;
}
@media (max-width: 768px) {
.container {
max-width: 100%;
}
}
"""
with gr.Blocks(title="Text to Image to 3D", css=custom_css) as demo:
# Wrap all content including header in a centered container
with gr.Column(elem_classes=["container"]):
gr.Markdown(
"""
# Text to Image to 3D Generation
This app allows you to generate 3D models from text prompts in two steps:
1. **Text to Image**: Generate an image using Stable Diffusion XL
2. **3D Generation**: Create a 3D mesh model using Stable Fast 3D (with automatic background removal)
**Instructions:**
- Enter your text prompt and generate an image
- Review the generated image and continue to generate the 3D model
- Background removal is handled automatically by Stable Fast 3D
- View and download your 3D model as a GLB file
"""
)
# Step 1: Text to Image
gr.Markdown("## Step 1: Text to Image")
# Image generation form
prompt = gr.Textbox(
label="Prompt",
placeholder="A cute robot character, 3D render, colorful",
lines=2,
)
negative_prompt = gr.Textbox(
label="Negative Prompt (optional)",
placeholder="blurry, low quality, distorted",
lines=2,
)
num_steps = gr.Slider(
label="Number of Inference Steps",
minimum=20,
maximum=50,
value=30,
step=5,
)
generate_btn = gr.Button("Generate Image", variant="primary")
# Image preview
step1_image = gr.Image(label="Generated Image", type="pil")
# Step 2: 3D Generation
gr.Markdown("## Step 2: 3D Generation")
gr.Markdown(
"*Background removal is handled automatically. You can use the image from Step 1 or upload your own image.*"
)
# 3D generation input image
step2_image_input = gr.Image(
label="Input Image",
type="pil",
sources=["upload", "clipboard"],
)
# 3D generation form
remesh_option = gr.Radio(
choices=["none", "triangle", "quad"],
label="Remeshing Option",
value="none",
)
vertex_count = gr.Slider(
label="Target Vertex Count (-1 for auto)",
minimum=-1,
maximum=20000,
value=-1,
step=100,
)
texture_size = gr.Slider(
label="Texture Size",
minimum=512,
maximum=2048,
value=1024,
step=256,
)
step2_generate_btn = gr.Button("Generate 3D Model", variant="primary")
# 3D model preview
step2_output = LitModel3D(
label="3D Model Preview",
visible=True,
clear_color=[0.0, 0.0, 0.0, 0.0],
height=600, # Set explicit height for better visibility
)
# File download component
step2_download = gr.File(
label="Download 3D Model (GLB)",
visible=True,
)
# Event handlers
generate_btn.click(
fn=step1_generate_image,
inputs=[prompt, negative_prompt, num_steps],
outputs=[step1_image, step2_image_input],
)
step2_generate_btn.click(
fn=step2_generate_3d,
inputs=[step2_image_input, remesh_option, vertex_count, texture_size],
outputs=[step2_output, step2_download],
)
if __name__ == "__main__":
# Delete previous gradio temp dir folder (like official app)
if os.path.exists(os.environ["GRADIO_TEMP_DIR"]):
print(f"Deleting {os.environ['GRADIO_TEMP_DIR']}")
import shutil
shutil.rmtree(os.environ["GRADIO_TEMP_DIR"])
demo.queue()
demo.launch(share=False)