# ---- Stage 1: base, CUDA 12.1 devel image with the build toolchain ----
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base

# Build-time environment for compiling CUDA extensions
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
    FORCE_CUDA=1 \
    MAX_JOBS=8
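
# The arch list covers sm_70 (V100) through sm_90 (H100), plus PTX for forward
# compatibility; trimming it to the deployment GPU, e.g. TORCH_CUDA_ARCH_LIST="8.0"
# for A100 only, shrinks any from-source extension builds. MAX_JOBS caps their
# compile parallelism.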

# System packages needed to build Python extensions and run the toolchain
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.10 \
    python3-pip \
    python3.10-dev \
    git \
    wget \
    curl \
    vim \
    build-essential \
    cmake \
    ninja-build \
    ccache \
    libssl-dev \
    libffi-dev \
    libjpeg-dev \
    libpng-dev \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Up-to-date packaging tooling before any wheel installs
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel

# ---- Stage 2: builder, installs the full Python stack (copied into the runtime image later) ----
FROM base AS builder

WORKDIR /build

# PyTorch wheels built against CUDA 12.1
RUN pip install --no-cache-dir \
    torch==2.2.0 \
    torchvision==0.17.0 \
    torchaudio==2.2.0 \
    --index-url https://download.pytorch.org/whl/cu121
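
# Optional sanity check (a sketch, left commented out to avoid an extra layer; note
# that torch.cuda.is_available() is always False at build time, since no GPU is
# attached to the build):
# RUN python3 -c "import torch; print(torch.__version__, torch.version.cuda)"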

# Inference stack: vLLM plus the Hugging Face ecosystem
RUN pip install --no-cache-dir \
    vllm==0.3.3 \
    transformers==4.40.0 \
    tokenizers==0.15.2 \
    sentencepiece==0.2.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.0 \
    safetensors==0.4.2 \
    huggingface-hub==0.21.4
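
# Optional: surface resolver conflicts early (a sketch; vLLM pins its own torch
# requirement, so if it silently replaced the cu121 wheel installed above, this
# version print would reveal it):
# RUN python3 -c "import torch, transformers; print(torch.__version__, transformers.__version__)"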

# Scientific-computing and API-serving dependencies
RUN pip install --no-cache-dir \
    numpy==1.26.4 \
    scipy==1.12.0 \
    pandas==2.2.1 \
    scikit-learn==1.4.1 \
    pydantic==2.6.4 \
    fastapi==0.110.0 \
    uvicorn[standard]==0.29.0 \
    aiohttp==3.9.3 \
    ray[default]==2.10.0

# GPU/system monitoring and metrics
RUN pip install --no-cache-dir \
    prometheus-client==0.20.0 \
    gputil==1.4.0 \
    psutil==5.9.8 \
    py-cpuinfo==9.0.0 \
    pynvml==11.5.0

# ---- Stage 3: runtime, slimmer CUDA runtime image without the build toolchain ----
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Serving defaults for the Helion-2.5 endpoint
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1
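
# These are plain ENVs, so each can be overridden per container,
# e.g. (hypothetical values):
#   docker run --gpus all -e TENSOR_PARALLEL_SIZE=4 -e GPU_MEMORY_UTILIZATION=0.90 ...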

# Minimal runtime packages: curl for the health check, libgomp1 for OpenMP-linked wheels
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.10 \
    python3-pip \
    curl \
    vim \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy the installed Python stack and console scripts out of the builder stage
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

WORKDIR /app

# Directories for model weights, server code, logs, and cache
RUN mkdir -p /models/helion /app/inference /app/logs /app/cache

# Application code and configuration
COPY ./inference /app/inference
COPY ./model_config.yaml /app/
COPY ./config.json /app/

RUN chmod +x /app/inference/*.py

# Run as an unprivileged user
RUN useradd -m -u 1000 helion && \
    chown -R helion:helion /app /models

USER helion

# Shell-form health check so ${PORT} is expanded at run time
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1
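
# The same endpoint can be probed by hand against a running container
# (assuming the default port mapping):
#   curl -f http://localhost:8000/health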

EXPOSE 8000 8001 8002

# Shell form so the ENV values above are actually expanded; exec-form CMD
# (a JSON array) performs no variable substitution
CMD python3 -m inference.server \
    --model "${MODEL_PATH}" \
    --host "${HOST}" \
    --port "${PORT}" \
    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
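
# A minimal build-and-run sketch (image tag and host model path are assumptions):
#   docker build -t deepxr/helion-inference:2.5.0-rnd .
#   docker run --gpus all -p 8000:8000 \
#     -v /data/models/helion:/models/helion \
#     deepxr/helion-inference:2.5.0-rnd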

LABEL maintainer="DeepXR Team" \
      version="2.5.0-rnd" \
      description="Helion-2.5 Research & Development Model - Advanced Language Model" \
      model="DeepXR/Helion-2.5-Rnd" \
      license="Apache-2.0"