#!/bin/bash
#
# Bulk-generates programming problems through the OpenAI Batch API by driving
# generate_problems_batch.py through five numbered steps: estimate, prepare,
# submit, monitor, and download/process.
#
# The script is interactive: it asks for a y/n confirmation before the
# expensive steps.

# Abort as soon as any command exits with a non-zero status.
set -e

# Start-up banner. One argument per output line; the final empty argument
# produces the trailing blank line.
printf '%s\n' \
  "🚀 OpenAI Batch API 批量生成编程问题" \
  "========================================" \
  "预算: \$40" \
  "模型: gpt-5-nano (Batch API - 50% off) - 最便宜的选项" \
  "预计可生成: ~160,000+ 个样本" \
  "========================================" \
  ""
# ---- Run configuration ----

# Model name handed to the helper script via --model.
MODEL=gpt-5-nano
# Threshold handed to the `prepare` sub-command via --min-score.
MIN_SCORE=60
# Budget figure matching the banner ("$40"); not passed to any command in the
# visible part of this script.
BUDGET=40

# Files used along the pipeline, in the order they come into play.
INPUT_FILE=function_dataset_v2.csv               # dataset read by `prepare`
BATCH_REQUESTS_FILE=batch_requests_full.jsonl    # request file written by `prepare`
BATCH_ID_FILE=batch_id.txt                       # where the submitted batch ID is stored
BATCH_RESULTS_RAW=batch_results_raw.jsonl        # raw output written by `download`
FINAL_OUTPUT=programming_problems_batch.jsonl    # final file written by `process`
# ---- Preflight checks ----
# Stop before any work is done when a prerequisite is missing.

# The input dataset must exist as a regular file.
[[ -f "$INPUT_FILE" ]] || {
  echo "❌ 错误: 找不到输入文件 $INPUT_FILE"
  exit 1
}

# The API key must be present (and non-empty) in the environment.
[[ -n "$OPENAI_API_KEY" ]] || {
  echo "❌ 错误: OPENAI_API_KEY 环境变量未设置"
  echo " 请运行: export OPENAI_API_KEY='your-api-key'"
  exit 1
}
# ---- Step 1/5: up-front cost estimate ----
echo "📊 步骤 1/5: 估算预算..."
echo "----------------------------------------"

# Fixed planning figures for the first estimate: a request count and average
# input/output token sizes per request.
initial_estimate_args=(
  --num-requests 44000
  --avg-input-tokens 1917
  --avg-output-tokens 2552
  --model "$MODEL"
)
python3 generate_problems_batch.py estimate "${initial_estimate_args[@]}"

echo ""
# Single-keystroke confirmation: -n 1 returns after one character, so the
# following echo supplies the line break the user never typed.
read -p "👉 继续执行? (y/n) " -n 1 -r
echo ""
case "$REPLY" in
  [Yy]) ;;
  *)
    echo "❌ 已取消"
    exit 0
    ;;
esac
# ---- Step 2/5: build the batch request file and re-estimate the cost ----
echo ""
echo "📋 步骤 2/5: 准备批量请求..."
echo "----------------------------------------"
python3 generate_problems_batch.py prepare \
  --input "$INPUT_FILE" \
  --output "$BATCH_REQUESTS_FILE" \
  --min-score "$MIN_SCORE" \
  --model "$MODEL"

# The line count of the request file is used as the number of requests.
# Some wc implementations pad the number with leading blanks; strip all
# whitespace so the value is clean in messages and command arguments.
REQUEST_COUNT=$(wc -l < "$BATCH_REQUESTS_FILE")
REQUEST_COUNT=${REQUEST_COUNT//[[:space:]]/}

# With nothing to submit there is no point in estimating or asking the user
# to confirm a submission, so stop here with a clear message.
if (( REQUEST_COUNT == 0 )); then
  echo "❌ 错误: 没有可提交的请求 ($BATCH_REQUESTS_FILE 为空)"
  exit 1
fi
echo "✅ 已准备 $REQUEST_COUNT 个请求"

# Repeat the estimate with the real request count; the average token sizes
# are the same fixed figures used for the first estimate.
echo ""
echo "💰 根据实际请求数量重新估算..."
python3 generate_problems_batch.py estimate \
  --num-requests "$REQUEST_COUNT" \
  --avg-input-tokens 1917 \
  --avg-output-tokens 2552 \
  --model "$MODEL"

echo ""
# Last confirmation before anything is sent to OpenAI. Declining keeps the
# prepared request file on disk.
read -p "👉 继续提交到 OpenAI? (y/n) " -n 1 -r
echo ""
if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
  echo "❌ 已取消 (批量请求文件已保存: $BATCH_REQUESTS_FILE)"
  exit 0
fi
# ---- Step 3/5: submit the batch and record its ID ----
echo ""
echo "🚀 步骤 3/5: 提交批处理任务到 OpenAI..."
echo "----------------------------------------"

# Capture stdout so the batch ID can be parsed from it. The exit status is
# collected by hand: a failing command substitution under `set -e` would end
# the script before the captured text was ever shown.
submit_status=0
SUBMIT_OUTPUT=$(python3 generate_problems_batch.py submit \
  --input "$BATCH_REQUESTS_FILE" \
  --model "$MODEL" \
  --description "Scientific computing problems - $REQUEST_COUNT samples") \
  || submit_status=$?

printf '%s\n' "$SUBMIT_OUTPUT"

if (( submit_status != 0 )); then
  echo "❌ 错误: 提交批处理任务失败 (退出码: $submit_status)"
  exit "$submit_status"
fi

# Prefer the token that follows "Batch created: "; otherwise fall back to the
# first batch_* token anywhere in the output. Bash's built-in regex matching
# replaces `grep -oP`, which needs a grep built with PCRE support, and always
# yields a single match.
# NOTE(review): the fallback pattern matches any token starting with
# "batch_", so a file name echoed in the output could be picked up — confirm
# against the helper script's actual output.
created_re='Batch created: ([^[:space:]]+)'
fallback_re='batch_[a-zA-Z0-9_]+'
BATCH_ID=""
if [[ "$SUBMIT_OUTPUT" =~ $created_re ]]; then
  BATCH_ID="${BASH_REMATCH[1]}"
elif [[ "$SUBMIT_OUTPUT" =~ $fallback_re ]]; then
  BATCH_ID="${BASH_REMATCH[0]}"
fi

if [[ -z "$BATCH_ID" ]]; then
  echo "❌ 错误: 无法获取 Batch ID"
  echo "请手动检查输出并记录 Batch ID"
  exit 1
fi

# Persist the ID so the batch can still be checked after this script exits.
printf '%s\n' "$BATCH_ID" > "$BATCH_ID_FILE"
echo ""
echo "✅ Batch ID 已保存到: $BATCH_ID_FILE"
echo "📝 Batch ID: $BATCH_ID"
echo ""
# ---- Step 4/5: wait for the batch to finish ----
echo "⏳ 步骤 4/5: 监控批处理状态..."
echo "----------------------------------------"
echo "批处理任务通常在几小时内完成(最多24小时)"
echo "您可以:"
echo " 1. 等待脚本自动监控(每5分钟检查一次)"
echo " 2. 按 Ctrl+C 退出,稍后运行监控命令:"
echo " python3 generate_problems_batch.py status $BATCH_ID"
echo ""

read -p "👉 是否自动监控? (y/n) " -n 1 -r
echo ""

# Declined: print the commands for finishing by hand and stop successfully.
# The `process` hint lists the same options this script passes when it runs
# that step itself.
if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
  echo "ℹ️ 跳过自动监控"
  echo "稍后请手动检查状态:"
  echo " python3 generate_problems_batch.py status $BATCH_ID"
  echo ""
  echo "完成后运行下载和处理命令:"
  echo " python3 generate_problems_batch.py download $BATCH_ID --output $BATCH_RESULTS_RAW"
  echo " python3 generate_problems_batch.py process --input $BATCH_RESULTS_RAW --output $FINAL_OUTPUT --model $MODEL --requests $BATCH_REQUESTS_FILE"
  exit 0
fi

echo "🔍 开始自动监控..."

# A single failed status query must not end a poll that may already have been
# running for hours, so failures are retried; only a run of consecutive
# failures gives up.
max_poll_failures=5
poll_failures=0

while true; do
  echo ""
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] 检查批处理状态..."

  # Collect the exit status by hand so `set -e` does not abort the loop.
  status_rc=0
  STATUS_OUTPUT=$(python3 generate_problems_batch.py status "$BATCH_ID") \
    || status_rc=$?
  printf '%s\n' "$STATUS_OUTPUT"

  if (( status_rc != 0 )); then
    poll_failures=$(( poll_failures + 1 ))
    if (( poll_failures >= max_poll_failures )); then
      echo "❌ 状态查询连续失败 $poll_failures 次,停止监控"
      echo "稍后请手动检查状态:"
      echo " python3 generate_problems_batch.py status $BATCH_ID"
      exit 1
    fi
    echo "⚠️ 状态查询失败 (第 $poll_failures 次),5分钟后重试..."
    sleep 300
    continue
  fi
  poll_failures=0

  # Terminal states, tested in order. Anything else counts as still running.
  # NOTE(review): the "cancelled" arm assumes the helper prints cancelled
  # batches as "Status: cancelled", like the other states — confirm.
  case "$STATUS_OUTPUT" in
    *"Status: completed"*)
      echo ""
      echo "✅ 批处理已完成!"
      break
      ;;
    *"Status: failed"*)
      echo ""
      echo "❌ 批处理失败!请检查错误信息"
      exit 1
      ;;
    *"Status: expired"*)
      echo ""
      echo "❌ 批处理已过期(超过24小时)"
      exit 1
      ;;
    *"Status: cancelled"*)
      echo ""
      echo "❌ 批处理已被取消"
      exit 1
      ;;
  esac

  echo "⏳ 批处理仍在进行中,5分钟后再次检查..."
  sleep 300
done
# ---- Step 5/5: fetch the raw results and build the final output file ----
printf '%s\n' \
  "" \
  "⬇️ 步骤 5/5: 下载和处理结果..." \
  "----------------------------------------"

# Download the batch output into the raw results file.
python3 generate_problems_batch.py download "$BATCH_ID" --output "$BATCH_RESULTS_RAW"

# Turn the raw results into the final file; the original request file is
# handed over alongside them via --requests.
process_args=(
  --input "$BATCH_RESULTS_RAW"
  --output "$FINAL_OUTPUT"
  --model "$MODEL"
  --requests "$BATCH_REQUESTS_FILE"
)
python3 generate_problems_batch.py process "${process_args[@]}"

# Closing summary: where the result is, how to inspect it, and the batch ID.
printf '%s\n' \
  "" \
  "========================================" \
  "✅ 全部完成!" \
  "========================================" \
  "最终结果文件: $FINAL_OUTPUT" \
  "" \
  "查看结果:" \
  " head -1 $FINAL_OUTPUT | python3 -m json.tool" \
  " wc -l $FINAL_OUTPUT" \
  "" \
  "Batch ID: $BATCH_ID (已保存在 $BATCH_ID_FILE)" \
  "========================================"