# ---- Stage 1: base, CUDA 12.1 devel image with the build toolchain ----
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 AS base

# Build-time environment for compiling CUDA extensions
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
    FORCE_CUDA=1 \
    MAX_JOBS=8
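
# The arch list covers sm_70 (V100) through sm_90 (H100), plus PTX for forward
# compatibility; trimming it to the deployment GPU, e.g. TORCH_CUDA_ARCH_LIST="8.0"
# for A100 only, shrinks any from-source extension builds. MAX_JOBS caps their
# compile parallelism.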

# System packages needed to build Python extensions and run the toolchain
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.10 \
    python3-pip \
    python3.10-dev \
    git \
    wget \
    curl \
    vim \
    build-essential \
    cmake \
    ninja-build \
    ccache \
    libssl-dev \
    libffi-dev \
    libjpeg-dev \
    libpng-dev \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Up-to-date packaging tooling before any wheel installs
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel

# ---- Stage 2: builder, installs the full Python stack (copied into the runtime image later) ----
FROM base AS builder

WORKDIR /build

# PyTorch wheels built against CUDA 12.1
RUN pip install --no-cache-dir \
    torch==2.2.0 \
    torchvision==0.17.0 \
    torchaudio==2.2.0 \
    --index-url https://download.pytorch.org/whl/cu121
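
# Optional sanity check (a sketch, left commented out to avoid an extra layer; note
# that torch.cuda.is_available() is always False at build time, since no GPU is
# attached to the build):
# RUN python3 -c "import torch; print(torch.__version__, torch.version.cuda)"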

# Inference stack: vLLM plus the Hugging Face ecosystem
RUN pip install --no-cache-dir \
    vllm==0.3.3 \
    transformers==4.40.0 \
    tokenizers==0.15.2 \
    sentencepiece==0.2.0 \
    accelerate==0.28.0 \
    bitsandbytes==0.43.0 \
    safetensors==0.4.2 \
    huggingface-hub==0.21.4
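
# Optional: surface resolver conflicts early (a sketch; vLLM pins its own torch
# requirement, so if it silently replaced the cu121 wheel installed above, this
# version print would reveal it):
# RUN python3 -c "import torch, transformers; print(torch.__version__, transformers.__version__)"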

# Scientific-computing and API-serving dependencies
RUN pip install --no-cache-dir \
    numpy==1.26.4 \
    scipy==1.12.0 \
    pandas==2.2.1 \
    scikit-learn==1.4.1 \
    pydantic==2.6.4 \
    fastapi==0.110.0 \
    uvicorn[standard]==0.29.0 \
    aiohttp==3.9.3 \
    ray[default]==2.10.0

# GPU/system monitoring and metrics
RUN pip install --no-cache-dir \
    prometheus-client==0.20.0 \
    gputil==1.4.0 \
    psutil==5.9.8 \
    py-cpuinfo==9.0.0 \
    pynvml==11.5.0

# ---- Stage 3: runtime, slimmer CUDA runtime image without the build toolchain ----
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

# Serving defaults for the Helion-2.5 endpoint
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    CUDA_HOME=/usr/local/cuda \
    MODEL_NAME=DeepXR/Helion-2.5-Rnd \
    MODEL_PATH=/models/helion \
    PORT=8000 \
    HOST=0.0.0.0 \
    TENSOR_PARALLEL_SIZE=2 \
    MAX_MODEL_LEN=131072 \
    GPU_MEMORY_UTILIZATION=0.95 \
    WORKERS=1
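
# These are plain ENVs, so each can be overridden per container,
# e.g. (hypothetical values):
#   docker run --gpus all -e TENSOR_PARALLEL_SIZE=4 -e GPU_MEMORY_UTILIZATION=0.90 ...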

# Minimal runtime packages: curl for the health check, libgomp1 for OpenMP-linked wheels
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.10 \
    python3-pip \
    curl \
    vim \
    libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Copy the installed Python stack and console scripts out of the builder stage
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

WORKDIR /app

# Directories for model weights, server code, logs, and cache
RUN mkdir -p /models/helion /app/inference /app/logs /app/cache

# Application code and configuration
COPY ./inference /app/inference
COPY ./model_config.yaml /app/
COPY ./config.json /app/

RUN chmod +x /app/inference/*.py

# Run as an unprivileged user
RUN useradd -m -u 1000 helion && \
    chown -R helion:helion /app /models

USER helion

# Shell-form health check so ${PORT} is expanded at run time
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT}/health || exit 1
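
# The same endpoint can be probed by hand against a running container
# (assuming the default port mapping):
#   curl -f http://localhost:8000/health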

EXPOSE 8000 8001 8002

# Shell form so the ENV values above are actually expanded; exec-form CMD
# (a JSON array) performs no variable substitution
CMD python3 -m inference.server \
    --model "${MODEL_PATH}" \
    --host "${HOST}" \
    --port "${PORT}" \
    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
    --max-model-len "${MAX_MODEL_LEN}" \
    --gpu-memory-utilization "${GPU_MEMORY_UTILIZATION}"
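
# A minimal build-and-run sketch (image tag and host model path are assumptions):
#   docker build -t deepxr/helion-inference:2.5.0-rnd .
#   docker run --gpus all -p 8000:8000 \
#     -v /data/models/helion:/models/helion \
#     deepxr/helion-inference:2.5.0-rnd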

LABEL maintainer="DeepXR Team" \
      version="2.5.0-rnd" \
      description="Helion-2.5 Research & Development Model - Advanced Language Model" \
      model="DeepXR/Helion-2.5-Rnd" \
      license="Apache-2.0"