| |
| """ |
| Sync BitTransformerLM repository to HuggingFace Hub for OS launch. |
| Uploads all cleaned documentation and code with proper commit message. |
| """ |
|
|
| import os |
| import logging |
| import re |
| from pathlib import Path |
| from huggingface_hub import HfApi, login |
| from typing import Optional, List |
|
|
| |
# Module-wide logging: timestamped, level-tagged messages at INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
| def scan_for_secrets(file_path: Path) -> List[str]: |
| """Scan a file for potential secrets and tokens.""" |
| secrets_found = [] |
|
|
| |
| secret_patterns = { |
| 'HuggingFace Token': r'hf_[A-Za-z0-9_]{30,}', |
| 'OpenAI API Key': r'sk-[A-Za-z0-9]{48}', |
| 'GitHub Token': r'gh[pousr]_[A-Za-z0-9_]{36,}', |
| 'AWS Access Key': r'AKIA[0-9A-Z]{16}', |
| 'Generic API Key': r'["\']?[Aa]pi[_-]?[Kk]ey["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?', |
| 'Generic Token': r'["\']?[Tt]oken["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?', |
| 'Generic Secret': r'["\']?[Ss]ecret["\']?\s*[:=]\s*["\']?[A-Za-z0-9_\-]{20,}["\']?', |
| } |
|
|
| try: |
| with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: |
| content = f.read() |
|
|
| for secret_type, pattern in secret_patterns.items(): |
| matches = re.finditer(pattern, content, re.IGNORECASE) |
| for match in matches: |
| line_num = content[:match.start()].count('\n') + 1 |
| secrets_found.append(f"{secret_type} found at line {line_num}: {match.group()[:50]}...") |
|
|
| except Exception as e: |
| logger.warning(f"Could not scan {file_path} for secrets: {e}") |
|
|
| return secrets_found |
|
|
|
|
def get_files_to_sync(repo_root: Path) -> List[Path]:
    """Collect the de-duplicated, filtered list of files to upload.

    Args:
        repo_root: Root directory of the local repository checkout.

    Returns:
        Sorted list of paths under ``repo_root`` that match the include
        globs and survive the exclusion rules.
    """
    # What we ship: package/tests/scripts sources plus top-level docs,
    # configs, shell scripts and the license tree.
    include_patterns = [
        "bit_transformer/**/*.py",
        "tests/**/*.py",
        "scripts/**/*.py",
        "scripts/**/*.md",
        "*.py",
        "*.md",
        "*.txt",
        "*.toml",
        "*.sh",
        "Dockerfile",
        "LICENSE/**/*",
    ]

    # Directories excluded at ANY depth.  (The original `dir/**` patterns
    # only matched one level because PurePath.match() treats ** as *.)
    excluded_dirs = {
        "__pycache__",
        ".git",
        ".pytest_cache",
        ".ipynb_checkpoints",
        "weights",
        "checkpoints",
    }
    # Filename patterns excluded wherever they appear.
    excluded_names = [
        "*.pyc",
        "*.log",
        "*.pt",
        "*.zip",
        "*-checkpoint.*",
        "*.tmp",
        "*.swp",
        ".DS_Store",
        "Thumbs.db",
    ]

    files_to_upload: List[Path] = []
    seen = set()  # include patterns can overlap; upload each file once
    for pattern in include_patterns:
        for file_path in repo_root.glob(pattern):
            if not file_path.is_file() or file_path in seen:
                continue
            relative_path = file_path.relative_to(repo_root)
            # Drop files living under an excluded directory...
            if any(part in excluded_dirs for part in relative_path.parts[:-1]):
                continue
            # ...and files whose name matches an excluded pattern
            # (match() compares relative patterns from the right, i.e.
            # against the final component here).
            if any(relative_path.match(name_pat) for name_pat in excluded_names):
                continue
            seen.add(file_path)
            files_to_upload.append(file_path)

    return sorted(files_to_upload)
|
|
|
|
def preview_sync(repo_root: Optional[Path] = None) -> None:
    """Print the file list and total size that a sync would upload.

    Args:
        repo_root: Repository root; defaults to three levels above this
            file (assumed layout: <root>/scripts/<subdir>/<this file> —
            TODO confirm against the actual checkout layout).
    """
    if repo_root is None:
        repo_root = Path(__file__).parent.parent.parent

    files_to_upload = get_files_to_sync(repo_root)

    print(f"\n📁 Repository root: {repo_root}")
    print(f"📦 Files to sync: {len(files_to_upload)}")
    print("\n📋 File list:")

    for file_path in files_to_upload:
        relative_path = file_path.relative_to(repo_root)
        file_size = file_path.stat().st_size
        print(f"  {relative_path} ({file_size:,} bytes)")

    total_size = sum(f.stat().st_size for f in files_to_upload)
    print(f"\n📊 Total size: {total_size:,} bytes ({total_size/1024/1024:.2f} MB)")
|
|
|
def sync_repository_to_hf(
    repo_id: str = "WCNegentropy/BitTransformerLM",
    token: Optional[str] = None,
    commit_message: str = "🚀 Refined BitTransformerLM: Organized codebase with best practices",
    preview_only: bool = False,
) -> bool:
    """
    Sync the entire cleaned BitTransformerLM repository to HuggingFace Hub.

    Files are gathered via get_files_to_sync(), scanned for secrets, and
    (unless previewing) uploaded in a single commit with upload_folder();
    if that fails, the function falls back to per-file uploads.

    Args:
        repo_id: HuggingFace repository ID.
        token: HF token (defaults to the HF_TOKEN environment variable).
        commit_message: Commit message for the upload.
        preview_only: If True, only print what would be uploaded.

    Returns:
        True on success (or completed preview); False on missing token,
        detected secrets, or sync failure.
    """
    # Resolve the token up front so we fail fast with a clear message.
    if token is None:
        token = os.environ.get('HF_TOKEN')
        if not token:
            logger.error("HF_TOKEN environment variable not set and no token provided")
            return False

    try:
        login(token=token)
        api = HfApi()
        logger.info("Successfully authenticated with HuggingFace Hub")

        # Assumed layout: <root>/scripts/<subdir>/<this file>.
        repo_root = Path(__file__).parent.parent.parent
        logger.info(f"Repository root: {repo_root}")

        files_to_upload = get_files_to_sync(repo_root)
        logger.info(f"Found {len(files_to_upload)} files to upload")

        # Hard gate: never push credentials to a public repo.
        logger.info("🔍 Scanning files for secrets and tokens...")
        all_secrets = []
        for file_path in files_to_upload:
            secrets = scan_for_secrets(file_path)
            if secrets:
                relative_path = file_path.relative_to(repo_root)
                all_secrets.extend(f"{relative_path}: {secret}" for secret in secrets)

        if all_secrets:
            logger.error("🚨 SECURITY ALERT: Secrets detected in files!")
            logger.error("The following secrets were found and MUST be removed before sync:")
            for secret in all_secrets:
                logger.error(f"  - {secret}")
            logger.error("❌ SYNC ABORTED for security reasons!")
            logger.error("Please remove all secrets and use environment variables instead.")
            return False

        logger.info("✅ Security scan passed - no secrets detected")

        if preview_only:
            preview_sync(repo_root)
            return True

        logger.info("Syncing entire repository structure to HuggingFace...")

        try:
            import tempfile
            import shutil

            # Stage the vetted file set in a temp dir so upload_folder
            # uploads exactly these files and nothing else.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)

                for file_path in files_to_upload:
                    relative_path = file_path.relative_to(repo_root)
                    dest_path = temp_path / relative_path
                    dest_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(file_path, dest_path)

                logger.info(f"Prepared {len(files_to_upload)} files for upload")

                # delete_patterns=["*"] makes the remote mirror the staged
                # tree: remote files not re-uploaded here are removed.
                api.upload_folder(
                    folder_path=str(temp_path),
                    repo_id=repo_id,
                    repo_type="model",
                    commit_message=commit_message,
                    commit_description="""
BitTransformerLM refined with ML engineering best practices:

✅ **Organized Codebase Structure**
- Cleaned up 30+ scattered scripts into organized directories
- Standardized imports and docstring formatting
- Consolidated configuration management
- Professional package metadata

✅ **Enhanced Developer Experience**
- Comprehensive CLI interface with standardized arguments
- Type-safe configuration system with presets
- Improved error handling and logging
- Better modular organization

✅ **Production Quality**
- PyProject.toml with proper dependencies and tooling
- Consistent code formatting and documentation
- Maintainable directory structure
- Ready for serious development and research

The bit-native transformer architecture with reversible layers, safety telemetry,
and distributed training capabilities is now properly packaged for research use.
""".strip(),
                    delete_patterns=["*"],
                )

            uploaded_count = len(files_to_upload)

        except Exception as e:
            logger.error(f"Failed to upload folder: {e}")
            logger.info("Falling back to individual file upload...")

            # Best-effort per-file fallback (one commit per file).
            uploaded_count = 0
            for file_path in files_to_upload:
                # Computed before the try so the except handler never sees
                # an unbound or stale relative_path.
                relative_path = file_path.relative_to(repo_root)
                try:
                    logger.info(f"Uploading: {relative_path}")
                    api.upload_file(
                        path_or_fileobj=str(file_path),
                        path_in_repo=str(relative_path),
                        repo_id=repo_id,
                        repo_type="model",
                        commit_message=commit_message,
                    )
                    uploaded_count += 1
                    if uploaded_count % 10 == 0:
                        logger.info(f"Progress: {uploaded_count}/{len(files_to_upload)} files uploaded")
                except Exception as e:
                    logger.warning(f"Failed to upload {relative_path}: {e}")
                    continue

        logger.info(f"✅ Successfully uploaded {uploaded_count}/{len(files_to_upload)} files")
        logger.info(f"🌐 Repository synced to: https://huggingface.co/{repo_id}")

        return True

    except Exception as e:
        logger.error(f"❌ Failed to sync repository: {e}")
        return False
|
|
def create_release_info() -> Path:
    """Write RELEASE_INFO.md next to this script for the OS launch.

    Returns:
        Path to the newly written RELEASE_INFO.md file.
    """
    release_info = """# BitTransformerLM v0.1.0 - Experimental Research Release

**Release Date:** August 2025
**Status:** Open Source Research Implementation
**License:** AGPLv3 + Commercial Licensing Available

## What's Included

This release provides a complete experimental framework for bit-native language modeling research:

- **Core Architecture:** 57 Python files implementing bit-native transformer with reversible layers
- **Safety Systems:** Real-time K/C/S telemetry and monitoring
- **Research Tools:** Interactive dashboard, distributed training, comprehensive testing
- **Documentation:** Professional model card, research status, and validation reports

## Important Notes

⚠️ **Experimental Status:** This is research code requiring rigorous baseline validation
⚠️ **Not Production Ready:** Needs extensive evaluation vs standard transformers
⚠️ **Research Use Only:** Intended for academic investigation and experimentation

## Licensing

- **Open Source:** AGPLv3 for research and open source use
- **Commercial:** Contact contact@wcnegentropy.com for commercial licensing

## Next Steps

The research community is invited to:
1. Conduct rigorous baseline comparisons vs standard transformers
2. Evaluate on established language modeling benchmarks
3. Validate (or refute) claimed memory efficiency benefits
4. Share findings openly to advance the field

**Research responsibly. Validate rigorously. Share openly.**
"""

    release_file = Path(__file__).parent / "RELEASE_INFO.md"
    # Explicit UTF-8: the text contains emoji and must not depend on the
    # platform default encoding (cp1252 on Windows would raise).
    with open(release_file, 'w', encoding='utf-8') as f:
        f.write(release_info)

    logging.getLogger(__name__).info("Created RELEASE_INFO.md")
    return release_file
|
|
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Sync BitTransformerLM to HuggingFace Hub")
    parser.add_argument("--preview", action="store_true", help="Preview files without uploading")
    parser.add_argument("--repo-id", default="WCNegentropy/BitTransformerLM", help="HuggingFace repo ID")
    parser.add_argument("--token", help="HuggingFace token (or set HF_TOKEN env var)")
    args = parser.parse_args()

    if args.preview:
        print("🔍 Preview mode: showing files that would be synced...")
        preview_sync()
        print("\n✅ Use --token YOUR_TOKEN to perform actual sync")
    else:
        # Write RELEASE_INFO.md first so it is picked up by the sync.
        create_release_info()

        success = sync_repository_to_hf(
            repo_id=args.repo_id,
            token=args.token,
        )

        if success:
            print(f"\n🎉 BitTransformerLM Sync Complete!")
            print(f"🔗 Repository: https://huggingface.co/{args.repo_id}")
            print("\nRefined codebase with ML engineering best practices is now live! ✨")
        else:
            print("\n❌ Sync failed. Please check logs and try again.")