import json
import random
import shutil
from collections import Counter
from pathlib import Path
# --- Configuration ---
# Labeled dataset; it is read below and later rewritten in place.
INPUT_PATH = Path("data", "labeled", "labeled_dockerfiles.jsonl")
# One-time copy of the dataset, stored next to the input file.
BACKUP_PATH = INPUT_PATH.with_name("labeled_dockerfiles_backup.jsonl")
# JSON file whose first TOP_N entries are used as the rule filter.
TOP_RULES_PATH = Path("data", "metadata", "top_rules.json")
# The balanced result overwrites the input file.
OUTPUT_PATH = INPUT_PATH
# Upper bounds on how many samples of each label are kept.
MAX_GOOD = 1_500
MAX_BAD = 15_000
# Number of leading entries taken from TOP_RULES_PATH.
TOP_N = 30
# --- Backup ---
# Copy the dataset once before it gets overwritten; an existing backup is
# left untouched so later runs do not replace it.
if INPUT_PATH.exists():
    if BACKUP_PATH.exists():
        print(f"ℹ️ Kopia zapasowa już istnieje: {BACKUP_PATH.name}")
    else:
        print(f"📦 Tworzę kopię zapasową → {BACKUP_PATH.name}")
        shutil.copy(INPUT_PATH, BACKUP_PATH)
# --- Rule filter ---
# Read the rule list from disk and keep only its first TOP_N entries as a set
# for fast membership tests.
with open(TOP_RULES_PATH, encoding="utf-8") as rules_file:
    ranked_rules = json.load(rules_file)
top_rules = set(ranked_rules[:TOP_N])
print(f"🏆 Używamy top {TOP_N} reguł")
# --- Load the labeled records and split them by label ---
print("🔍 Wczytywanie danych...")
good_samples = []
bad_samples = []

with open(INPUT_PATH, encoding="utf-8") as f:
    for line in f:
        # JSONL files frequently contain blank separator or trailing lines;
        # json.loads raises on them, so whitespace-only lines are skipped.
        if not line.strip():
            continue
        obj = json.loads(line)
        if obj["label"] == "good":
            good_samples.append(obj)
        elif obj["label"] == "bad":
            # Keep a "bad" record only if it triggered at least one top rule.
            rules = set(obj.get("rules_triggered", []))
            if rules & top_rules:
                bad_samples.append(obj)

print(f"✅ Good: {len(good_samples)} | ❌ Bad zawierające top {TOP_N} reguły: {len(bad_samples)}")
# --- Down-sample the "good" class ---
# Draw at most MAX_GOOD records at random; when fewer are available, all of
# them are taken (in random order).
good_quota = min(len(good_samples), MAX_GOOD)
balanced_good = random.sample(good_samples, good_quota)
# --- Rule frequencies among the retained "bad" samples ---
print("⚙️ Oceniam pliki BAD pod kątem rzadkości reguł...")

# Count every occurrence of a top rule across all retained "bad" samples;
# rules outside top_rules are not counted.
rule_freq = Counter(
    rule
    for entry in bad_samples
    for rule in entry.get("rules_triggered", [])
    if rule in top_rules
)
def compute_score(sample):
    """Return the rarity score of one sample.

    Every distinct rule of the sample that is also in ``top_rules``
    contributes ``1 / rule_freq[rule]``, so rules with lower counts weigh
    more. Rules whose count in ``rule_freq`` is zero are ignored. A sample
    without a ``rules_triggered`` key scores 0.
    """
    matched = set(sample.get("rules_triggered", [])) & top_rules
    counts = (rule_freq[rule] for rule in matched)
    return sum(1 / count for count in counts if count > 0)
def _rarity_sort_key(sample):
    """Return the sort key: rarity score first, then negated top-rule count."""
    rarity = compute_score(sample)
    top_hits = set(sample.get("rules_triggered", [])) & top_rules
    return rarity, -len(top_hits)


# Descending order: the highest score comes first; among equal scores the
# sample with fewer top rules comes first (its negated count is larger).
scored_bad = sorted(bad_samples, key=_rarity_sort_key, reverse=True)

# Keep at most MAX_BAD of the best-ranked "bad" samples.
balanced_bad = scored_bad[:MAX_BAD]
# --- Merge both classes, shuffle, and write the balanced dataset ---
balanced_all = [*balanced_good, *balanced_bad]
random.shuffle(balanced_all)

# OUTPUT_PATH is the same path as the input, so this overwrites the dataset
# with one JSON object per line.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as sink:
    for record in balanced_all:
        json.dump(record, sink)
        sink.write("\n")

total_written = len(balanced_all)
print(f"\n✅ Zapisano zbalansowany zbiór (tylko top {TOP_N} reguły): {total_written} → {OUTPUT_PATH.name}")
print(f" - Good: {len(balanced_good)}")
print(f" - Bad: {len(balanced_bad)}")