#!/usr/bin/env python3
"""
Script untuk generate README.md dari template
Penggunaan: python generate_readme.py config.yaml
"""
import argparse
from pathlib import Path
import yaml


def load_config(config_path):
    """Load configuration from a YAML file."""
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def load_template(template_path):
    """Load the README template."""
    with open(template_path, "r", encoding="utf-8") as f:
        return f.read()


def replace_placeholders(template, config):
    """Replace placeholders with values from the config."""
    content = template
    # Replace every {{KEY}} placeholder with the corresponding config value
    for key, value in config.items():
        placeholder = f"{{{{{key}}}}}"
        if isinstance(value, (list, dict)):
            # Convert list/dict values to a YAML-formatted string
            value = yaml.dump(
                value, default_flow_style=False, allow_unicode=True
            ).strip()
        content = content.replace(placeholder, str(value))
    return content
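

# Placeholder substitution, illustrated (hypothetical values, not from this repo):
#   config = {"MODEL_TITLE": "My Model"}
#   a template line "# {{MODEL_TITLE}}" is rendered as "# My Model"
# List and dict values are serialized with yaml.dump before being substituted.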


def generate_readme(config_path, template_path, output_path):
    """Generate a README from the template and config."""
    config = load_config(config_path)
    template = load_template(template_path)
    readme_content = replace_placeholders(template, config)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(readme_content)
    print(f"README generated successfully: {output_path}")


def create_sample_config(output_path):
    """Create a sample config file."""
    sample_config = {
        # Metadata
        "LICENSE": "mit",
        "LANGUAGE": "id",
        "LIBRARY_NAME": "transformers",
        "PIPELINE_TAG": "text-classification",
        "DATASET_TYPE": "custom",
        "INFERENCE_ENABLED": True,
        # Model Info
        "MODEL_NAME": "BERT Indonesian Topic Classification (16 labels)",
        "MODEL_TITLE": "BERT Indonesian Topic Classification (16 labels)",
        "BASE_MODEL": "cahya/bert-base-indonesian-1.5G",
        "TASK_TYPE": "text-classification",
        "TASK_NAME": "Topic Classification",
        "TASK_DESCRIPTION": "Topic classification (single-label)",
        "NUM_LABELS": 16,
        "LABELS_INLINE": "Politik, Ekonomi, Olahraga, Teknologi, dll.",
        "DATASET_NAME": "Custom Dataset (ID)",
        "SPLIT_TYPE": "validation",
        # Visualization
        "VISUALIZATION_TYPE": "Confusion Matrix",
        "VISUALIZATION_FILENAME": "confusion_matrix.png",
        # Tags (as a list)
        "TAGS": [
            " - indonesian",
            " - indonesia",
            " - topic-classification",
            " - bert",
        ],
        # Metrics (as a list)
        "METRICS": [
            " - type: accuracy",
            " value: 0.921",
            " - type: f1",
            " name: f1_macro",
            " value: 0.893",
            " - type: f1",
            " name: f1_micro",
            " value: 0.912",
        ],
        # Content sections
"INTENDED_USE": "- Klasifikasi topik untuk teks berbahasa Indonesia pada domain umum.",
"LIMITATIONS": """- Performa bergantung pada distribusi label dataset Anda.
- Teks OOD (di luar domain data latih) bisa turun akurasinya.""",
"TRAINING_DETAILS": """- Framework: 🤗 Transformers (PyTorch)
- Max length: 512
- Batch size: 16
- Epochs: 3
- Learning rate: 2e-5
- Weight decay: 0.01
- Warmup ratio: 0.1
- Scheduler: linear
- Mixed precision: true""",
"EVALUATION_DETAILS": """- Split: 80/20 stratified
- Accuracy (val): **92.1%**
- F1 Macro (val): **89.3%**
- F1 Micro (val): **91.2%**
Per-label report tersedia pada artifact `eval_results.json`.""",
"USAGE_CODE": """from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
repo_id = "your-username/model-name"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(repo_id).eval()
text = "Contoh teks untuk diklasifikasi."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
logits = model(**inputs).logits
pred_id = logits.argmax(-1).item()
label = model.config.id2label[pred_id]
print(label)""",
"ADDITIONAL_INFO": """## Citation
Jika menggunakan model ini, mohon kutip:
```bibtex
@misc{your-model-2025,
title={Model Title},
author={Your Name},
year={2025},
url={https://huggingface.co/your-username/model-name}
}
```""",
}
    with open(output_path, "w", encoding="utf-8") as f:
        yaml.dump(
            sample_config, f, default_flow_style=False, allow_unicode=True, indent=2
        )
    print(f"Sample config created: {output_path}")


def main():
    parser = argparse.ArgumentParser(description="Generate a README from a template")
    parser.add_argument("--config", "-c", help="Path to the YAML config file")
    parser.add_argument(
        "--template",
        "-t",
        default="README.md",
        help="Path to the README template (default: README.md)",
    )
    parser.add_argument(
        "--output",
        "-o",
        default="README_generated.md",
        help="Output README path (default: README_generated.md)",
    )
    parser.add_argument(
        "--create-sample", action="store_true", help="Create a sample config file"
    )
    args = parser.parse_args()

    if args.create_sample:
        create_sample_config("sample_config.yaml")
        return

    if not args.config:
        print("Error: --config is required unless --create-sample is used")
        parser.print_help()
        return

    if not Path(args.config).exists():
        print(f"Error: Config file not found: {args.config}")
        return

    if not Path(args.template).exists():
        print(f"Error: Template file not found: {args.template}")
        return

    generate_readme(args.config, args.template, args.output)


if __name__ == "__main__":
    main()
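

# Example invocations (based on the flags defined in main()):
#   python generate_readme.py --create-sample
#   python generate_readme.py --config sample_config.yaml -t README.md -o README_generated.md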