| | """ |
| | Generate synthetic training data for Code Comment Quality Classifier |
| | """ |
| | import pandas as pd |
| | import os |
| | import random |
| |
|
| |
|
| | |
# Seed comments for the "excellent" class: detailed, specific documentation
# that names the algorithm, states parameters/returns/raises, and often
# includes complexity or configuration details.
EXCELLENT_COMMENTS = [
    "This function calculates the Fibonacci sequence using dynamic programming to avoid redundant calculations. Time complexity: O(n), Space complexity: O(n)",
    "Validates user input against SQL injection attacks using parameterized queries. Returns True if safe, False otherwise. Raises ValueError for invalid input types.",
    "Binary search implementation for sorted arrays. Uses divide-and-conquer approach. Params: arr (sorted list), target (value). Returns: index or -1 if not found.",
    "Implements the Singleton pattern to ensure only one instance of DatabaseConnection exists. Thread-safe using double-checked locking.",
    "Parses JSON configuration file and validates against schema. Handles nested objects and arrays. Raises ConfigurationError if validation fails.",
    "Asynchronous HTTP request handler with retry logic and exponential backoff. Max retries: 3. Timeout: 30s. Returns: Response object or None on failure.",
    "Generates secure random tokens for authentication using CSPRNG. Length: 32 bytes. Returns: hex-encoded string. Used in password reset flows.",
    "Custom hook that debounces state updates to prevent excessive re-renders. Delay: configurable ms. Returns: debounced value and setter function.",
    "Optimized matrix multiplication using Strassen's algorithm. Suitable for large matrices (n > 64). Time complexity: O(n^2.807).",
    "Decorator that caches function results with LRU eviction policy. Max size: 128 entries. Thread-safe. Improves performance for expensive computations.",
]
| |
|
# Seed comments for the "helpful" class: accurate one-line summaries that
# say what the code does but omit parameters, returns, and edge cases.
HELPFUL_COMMENTS = [
    "Calculates the sum of two numbers and returns the result",
    "This function sorts the array in ascending order",
    "Checks if the user is logged in before proceeding",
    "Converts temperature from Celsius to Fahrenheit",
    "Returns the current timestamp in UTC format",
    "Validates email format using regex pattern",
    "Fetches user data from the database by ID",
    "Updates the UI when data changes",
    "Handles file upload and saves to storage",
    "Generates a random string of specified length",
    "Removes duplicates from the list",
    "Encrypts password before storing in database",
    "Sends email notification to user",
    "Formats date string for display",
    "Calculates total price including tax",
]
| |
|
# Seed comments for the "unclear" class: vague placeholders and filler
# phrases that convey no real information about the code.
UNCLEAR_COMMENTS = [
    "does stuff",
    "magic happens here",
    "don't touch this",
    "idk why this works but it does",
    "temporary solution",
    "quick fix",
    "handles things",
    "processes data",
    "important function",
    "legacy code",
    "weird edge case",
    "not sure what this does",
    "complicated logic",
    "TODO",
    "fix me",
    "helper method",
    "utility function",
    "wrapper",
    "handler",
    "manager",
]
| |
|
# Seed comments for the "outdated" class: deprecation notices, FIXMEs, and
# references to superseded APIs or legacy implementations.
OUTDATED_COMMENTS = [
    "DEPRECATED: Use the new API endpoint instead",
    "This will be removed in version 2.0",
    "TODO: Refactor this to use async/await",
    "Old implementation - kept for backwards compatibility",
    "NOTE: This approach is no longer recommended",
    "FIXME: Memory leak issue - needs update",
    "Uses legacy authentication system",
    "WARNING: This method is obsolete",
    "Replaced by getUserInfo() in v1.5",
    "Temporary workaround - pending proper fix",
    "DEPRECATED: Direct database access - use ORM instead",
    "Old validation logic - update to new schema",
    "Uses outdated library - migrate to modern alternative",
    "This was for Python 2 compatibility",
    "FIXME: Security vulnerability - needs immediate update",
]
| |
|
| |
|
def generate_variations(base_comments: list, num_variations: int = 5) -> list:
    """Expand each seed comment into several lightly-varied copies.

    Each seed is emitted verbatim first, followed by ``num_variations - 1``
    variants built by attaching a randomly chosen prefix and suffix (both
    pools include empty strings, so some variants repeat the seed).

    Args:
        base_comments: Seed comment strings to expand.
        num_variations: Total copies produced per seed, including the
            unmodified original.

    Returns:
        A flat list: every seed followed by its generated variants.
    """
    prefix_pool = ["", "Note: ", "Important: ", "Info: ", ""]
    suffix_pool = ["", ".", "...", " // end", ""]

    expanded = []
    for seed in base_comments:
        expanded.append(seed)
        # Prefix is drawn before suffix on each iteration, matching the
        # established random-call order for reproducibility under seeding.
        expanded.extend(
            f"{random.choice(prefix_pool)}{seed}{random.choice(suffix_pool)}"
            for _ in range(num_variations - 1)
        )

    return expanded
| |
|
| |
|
def generate_dataset(output_path: str = "./data/comments.csv", samples_per_class: int = 250):
    """
    Generate a synthetic training dataset and save it as CSV.

    Builds exactly `samples_per_class` labeled comments for each of the
    four quality classes (excellent, helpful, unclear, outdated), shuffles
    them with a fixed seed, and writes a two-column CSV (comment, label).

    Args:
        output_path: Path to save the CSV file
        samples_per_class: Number of samples to generate per class
    """
    print("=" * 60)
    print("Generating Synthetic Training Data")
    print("=" * 60)

    # Only create a directory when the path actually contains one:
    # os.makedirs("") raises FileNotFoundError for bare filenames
    # such as --output comments.csv.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print("\nGenerating comment variations...")

    def _expand(base_comments: list) -> list:
        # Ceiling division so each class reaches at least samples_per_class
        # before truncation. Floor division under-generated whenever
        # samples_per_class was not a multiple of len(base_comments)
        # (e.g. 250 // 15 = 16 -> only 240 samples).
        per_comment = -(-samples_per_class // len(base_comments))
        return generate_variations(base_comments, per_comment)[:samples_per_class]

    excellent_samples = _expand(EXCELLENT_COMMENTS)
    helpful_samples = _expand(HELPFUL_COMMENTS)
    unclear_samples = _expand(UNCLEAR_COMMENTS)
    outdated_samples = _expand(OUTDATED_COMMENTS)

    data = {
        'comment': (
            excellent_samples +
            helpful_samples +
            unclear_samples +
            outdated_samples
        ),
        'label': (
            ['excellent'] * len(excellent_samples) +
            ['helpful'] * len(helpful_samples) +
            ['unclear'] * len(unclear_samples) +
            ['outdated'] * len(outdated_samples)
        )
    }

    df = pd.DataFrame(data)

    # Fixed seed keeps the shuffle reproducible across runs.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    df.to_csv(output_path, index=False)

    print("\n✓ Dataset generated successfully!")
    print(f"✓ Total samples: {len(df)}")
    print(f"✓ Saved to: {output_path}")

    print("\nClass distribution:")
    print(df['label'].value_counts().sort_index())

    print("\nSample comments:")
    print("-" * 60)
    for label in ['excellent', 'helpful', 'unclear', 'outdated']:
        sample = df[df['label'] == label].iloc[0]['comment']
        print(f"\n[{label.upper()}]")
        print(f"  {sample}")

    print("\n" + "=" * 60)
    print("Data generation complete! 🎉")
    print("=" * 60)
| |
|
| |
|
if __name__ == "__main__":
    import argparse

    # Thin CLI wrapper: parse the two options and hand off to
    # generate_dataset().
    cli = argparse.ArgumentParser(description="Generate synthetic training data")
    cli.add_argument(
        "--output",
        type=str,
        default="./data/comments.csv",
        help="Output path for the CSV file",
    )
    cli.add_argument(
        "--samples-per-class",
        type=int,
        default=250,
        help="Number of samples to generate per class",
    )
    opts = cli.parse_args()

    generate_dataset(opts.output, opts.samples_per_class)
| |
|