"""
Simple test script to verify that the GLEN environment is ready for The Vault dataset.
"""
import os
import sys
from pathlib import Path

import pandas as pd
import torch


def test_dependencies():
    """Check that the required third-party packages can be imported.

    Prints one status line per package, plus CUDA/GPU details for torch.
    Every package is checked even after a failure, so a single run lists
    everything that is missing.

    Returns:
        bool: True if all packages were imported, False otherwise.
    """
    # Local import: only this check needs importlib.
    import importlib

    print("Testing dependencies...")

    all_found = True
    for name in ("transformers", "torch", "pandas", "wandb"):
        try:
            module = importlib.import_module(name)
        except ImportError:
            print(f"❌ {name} not found")
            all_found = False
            continue

        # Fall back to a placeholder instead of raising if a package does
        # not expose __version__.
        version = getattr(module, "__version__", "unknown")
        print(f"✅ {name}: {version}")

        if name == "torch":
            cuda_available = module.cuda.is_available()
            print(f"✅ CUDA available: {cuda_available}")
            if cuda_available:
                print(f"✅ GPU: {module.cuda.get_device_name(0)}")

    return all_found


def test_data_files():
    """Check that the required The Vault data files exist.

    Looks for each expected TSV under ``data/the_vault`` (relative to the
    current working directory) and prints its size in KB, or a
    "not found" line. All files are checked so that every missing file
    is reported in one run.

    Returns:
        bool: True if every required file is present, False otherwise.
    """
    print("\nTesting data files...")

    data_dir = Path("data/the_vault")
    required_files = (
        "DOC_VAULT_train.tsv",
        "GTQ_VAULT_train.tsv",
        "ID_VAULT_t5_bm25_truncate_3.tsv",
        "DOC_VAULT_validate.tsv",
        "GTQ_VAULT_dev.tsv",
    )

    all_found = True
    for file_name in required_files:
        file_path = data_dir / file_name
        # is_file() rather than exists(): a directory that happens to have
        # the expected name is not a usable data file.
        if file_path.is_file():
            size_kb = file_path.stat().st_size / 1024
            print(f"✅ {file_name} ({size_kb:.1f} KB)")
        else:
            print(f"❌ {file_name} not found")
            all_found = False

    return all_found


def test_tevatron_imports():
    """Check that the tevatron modules used for GLEN can be imported.

    Tries to import the Phase 1 argument classes from
    ``tevatron.arguments`` and ``GPUMemoryMonitor`` from
    ``tevatron.utils.gpu_monitor``. Both imports are attempted even if the
    first one fails, so that every broken import is reported in one run.

    Returns:
        bool: True if both imports succeeded, False otherwise.
    """
    print("\nTesting tevatron imports...")

    all_imported = True

    try:
        from tevatron.arguments import (  # noqa: F401
            GLENP1ModelArguments,
            GLENP1DataArguments,
            GLENP1TrainingArguments,
        )
    except ImportError as e:
        print(f"❌ Phase 1 arguments import failed: {e}")
        all_imported = False
    else:
        print("✅ Phase 1 arguments imported")

    try:
        from tevatron.utils.gpu_monitor import GPUMemoryMonitor  # noqa: F401
    except ImportError as e:
        print(f"❌ GPU monitor import failed: {e}")
        all_imported = False
    else:
        print("✅ GPU monitor imported")

    return all_imported


def test_gpu_monitor():
    """Exercise ``GPUMemoryMonitor`` from ``tevatron.utils.gpu_monitor``.

    Builds a monitor, reads its memory stats and, when the stats report the
    monitor as enabled, prints total memory and usage and runs one
    ``check_memory()`` call. A disabled monitor is reported as a warning
    but still counts as a pass.

    Returns:
        bool: True if the monitor could be created and queried, False if
        anything raised (including the import itself).
    """
    print("\nTesting GPU monitor...")

    try:
        from tevatron.utils.gpu_monitor import GPUMemoryMonitor

        monitor = GPUMemoryMonitor(memory_threshold=0.8, check_interval=10)
        stats = monitor.get_memory_stats()

        if not stats["enabled"]:
            print("⚠️ GPU monitor disabled (no CUDA)")
            return True

        print("✅ GPU monitor enabled")
        print(f" - Total GPU memory: {stats['total_gb']:.2f} GB")
        print(f" - Current usage: {stats['usage_ratio']:.1%}")

        # The result of the memory check is only reported, not enforced.
        can_continue = monitor.check_memory()
        print(f" - Memory check passed: {can_continue}")
    except Exception as e:
        # Broad on purpose: any failure in the monitor is reported as a
        # failed check rather than aborting the whole script.
        print(f"❌ GPU monitor test failed: {e}")
        return False

    return True


def test_data_loading():
    """Load a small sample of the training documents and sanity-check it.

    Reads the first five rows of ``data/the_vault/DOC_VAULT_train.tsv``
    (tab-separated, relative to the current working directory), prints the
    row count and column names, and warns if the first ``doc_content``
    value is missing or not longer than 50 characters.

    Returns:
        bool: True if a non-empty sample was loaded (a short or missing
        ``doc_content`` only produces a warning); False if the file is
        missing, cannot be parsed, or contains no rows.
    """
    print("\nTesting data loading...")

    train_doc_path = "data/the_vault/DOC_VAULT_train.tsv"
    if not os.path.exists(train_doc_path):
        # A loading test cannot pass when there was nothing to load.
        print(f"❌ {train_doc_path} not found")
        return False

    try:
        df = pd.read_csv(train_doc_path, sep="\t", nrows=5)
    except Exception as e:
        print(f"❌ Data loading test failed: {e}")
        return False

    if df.empty:
        print(f"❌ Data loading test failed: {train_doc_path} contains no rows")
        return False

    print(f"✅ Loaded {len(df)} sample documents")
    print(f" - Columns: {list(df.columns)}")

    # An empty cell is read back as NaN (a float), so check the type before
    # measuring the length.
    first_doc = df["doc_content"].iloc[0] if "doc_content" in df.columns else None
    if isinstance(first_doc, str) and len(first_doc) > 50:
        print("✅ Document content looks valid")
    else:
        print("⚠️ Document content might be too short")

    return True


def main():
    """Run every environment check and print a summary.

    Each check is a zero-argument callable returning a truthy value on
    success. A check that raises is reported as failed, so the remaining
    checks still run.

    Returns:
        bool: True if all checks passed, False otherwise.
    """
    print("🧪 GLEN Environment Test for The Vault Dataset")
    print("=" * 50)

    tests = [
        ("Dependencies", test_dependencies),
        ("Data Files", test_data_files),
        ("Tevatron Imports", test_tevatron_imports),
        ("GPU Monitor", test_gpu_monitor),
        ("Data Loading", test_data_loading),
    ]

    passed = 0
    total = len(tests)

    for test_name, test_func in tests:
        print(f"\n📋 {test_name}")
        print("-" * 30)
        try:
            succeeded = test_func()
        except Exception as e:
            # Top-level boundary: report the error and keep going so one
            # broken check does not hide the results of the others.
            print(f"❌ {test_name} raised {type(e).__name__}: {e}")
            succeeded = False

        if succeeded:
            passed += 1
            print(f"✅ {test_name} PASSED")
        else:
            print(f"❌ {test_name} FAILED")

    print("\n" + "=" * 50)
    print(f"🎯 Test Results: {passed}/{total} tests passed")

    if passed != total:
        print("⚠️ Some tests failed. Please fix the issues above.")
        return False

    print("🎉 Environment is ready for GLEN training!")
    print("\nNext steps:")
    print("1. Run full preprocessing if needed:")
    print(" python scripts/preprocess_vault_dataset.py --input_dir the_vault_dataset/ --output_dir data/the_vault/")
    print("2. Start training:")
    print(" bash scripts/train_glen_p1_vault.sh")
    return True


if __name__ == "__main__":
    # Exit status 0 when every check passed, 1 otherwise, so the script can
    # gate shell pipelines.
    sys.exit(0 if main() else 1)