Upload folder using huggingface_hub
Browse files- README.md +77 -28
- generate_readme.py +193 -0
README.md
CHANGED
|
@@ -9,67 +9,72 @@ tags:
|
|
| 9 |
- indonesia
|
| 10 |
- topic-classification
|
| 11 |
- bert
|
|
|
|
| 12 |
datasets:
|
| 13 |
- custom
|
| 14 |
inference: true
|
| 15 |
model-index:
|
| 16 |
-
- name: BERT Indonesian Topic Classification (
|
| 17 |
results:
|
| 18 |
- task:
|
| 19 |
type: text-classification
|
| 20 |
name: Topic Classification
|
| 21 |
dataset:
|
| 22 |
-
name: Custom Dataset
|
| 23 |
type: custom
|
| 24 |
split: validation
|
| 25 |
metrics:
|
| 26 |
- type: accuracy
|
| 27 |
-
value:
|
| 28 |
- type: f1
|
| 29 |
name: f1_macro
|
| 30 |
-
value:
|
| 31 |
- type: f1
|
| 32 |
name: f1_micro
|
| 33 |
-
value:
|
| 34 |
---
|
| 35 |
|
| 36 |
-
# BERT Indonesian Topic Classification (
|
| 37 |
|
| 38 |
**Base model**: `cahya/bert-base-indonesian-1.5G`
|
| 39 |
**Task**: Topic classification (single-label)
|
| 40 |
-
**Labels (
|
| 41 |
|
| 42 |

|
| 43 |
|
| 44 |
## Intended use
|
| 45 |
|
| 46 |
-
|
| 47 |
|
| 48 |
## Limitations
|
| 49 |
|
| 50 |
-
- Performa bergantung pada distribusi label dataset
|
| 51 |
-
- Teks
|
|
|
|
|
|
|
| 52 |
|
| 53 |
## Training details
|
| 54 |
|
| 55 |
-
- Framework
|
| 56 |
-
-
|
| 57 |
-
-
|
| 58 |
-
-
|
| 59 |
-
-
|
| 60 |
-
-
|
| 61 |
-
-
|
| 62 |
-
-
|
| 63 |
-
-
|
|
|
|
|
|
|
| 64 |
|
| 65 |
## Evaluation
|
| 66 |
|
| 67 |
-
-
|
| 68 |
-
- Accuracy (
|
| 69 |
-
- F1 Macro (
|
| 70 |
-
- F1 Micro (
|
| 71 |
|
| 72 |
-
|
| 73 |
|
| 74 |
## How to use
|
| 75 |
|
|
@@ -77,15 +82,59 @@ Per-label report tersedia pada artifact `eval_results.json`.
|
|
| 77 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 78 |
import torch
|
| 79 |
|
| 80 |
-
|
|
|
|
| 81 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
| 82 |
model = AutoModelForSequenceClassification.from_pretrained(repo_id).eval()
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
| 86 |
with torch.no_grad():
|
| 87 |
logits = model(**inputs).logits
|
|
|
|
|
|
|
| 88 |
pred_id = logits.argmax(-1).item()
|
|
|
|
| 89 |
label = model.config.id2label[pred_id]
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
```
|
|
|
|
| 9 |
- indonesia
|
| 10 |
- topic-classification
|
| 11 |
- bert
|
| 12 |
+
- text-classification
|
| 13 |
datasets:
|
| 14 |
- custom
|
| 15 |
inference: true
|
| 16 |
model-index:
|
| 17 |
+
- name: BERT Indonesian Topic Classification (15 labels)
|
| 18 |
results:
|
| 19 |
- task:
|
| 20 |
type: text-classification
|
| 21 |
name: Topic Classification
|
| 22 |
dataset:
|
| 23 |
+
name: Custom Indonesian Dataset
|
| 24 |
type: custom
|
| 25 |
split: validation
|
| 26 |
metrics:
|
| 27 |
- type: accuracy
|
| 28 |
+
value: 0.92
|
| 29 |
- type: f1
|
| 30 |
name: f1_macro
|
| 31 |
+
value: 0.89
|
| 32 |
- type: f1
|
| 33 |
name: f1_micro
|
| 34 |
+
value: 0.91
|
| 35 |
---
|
| 36 |
|
| 37 |
+
# BERT Indonesian Topic Classification (15 labels)
|
| 38 |
|
| 39 |
**Base model**: `cahya/bert-base-indonesian-1.5G`
|
| 40 |
**Task**: Topic classification (single-label)
|
| 41 |
+
**Labels (15)**: Olahraga, Kecelakaan, Pendidikan, Politik, Judi Online, Teknologi, Kriminalitas, Infrastruktur, Kesehatan, Lalu Lintas, Bencana Alam, Ekonomi, Keuangan, Kemiskinan, Pariwisata
|
| 42 |
|
| 43 |

|
| 44 |
|
| 45 |
## Intended use
|
| 46 |
|
| 47 |
+
Model ini digunakan untuk klasifikasi topik teks berbahasa Indonesia pada 15 kategori utama. Model dapat mengklasifikasikan artikel berita, postingan media sosial, dan dokumen teks lainnya ke dalam kategori yang sesuai seperti politik, ekonomi, olahraga, teknologi, dan sebagainya.
|
| 48 |
|
| 49 |
## Limitations
|
| 50 |
|
| 51 |
+
- Performa model bergantung pada distribusi label dataset yang digunakan
|
| 52 |
+
- Teks di luar domain (OOD) yang tidak serupa dengan data training dapat mengalami penurunan akurasi
|
| 53 |
+
- Model dilatih khusus untuk bahasa Indonesia dan mungkin tidak optimal untuk bahasa lain
|
| 54 |
+
- Performa dapat bervariasi tergantung pada panjang dan kompleksitas teks input
|
| 55 |
|
| 56 |
## Training details
|
| 57 |
|
| 58 |
+
- **Framework**: 🤗 Transformers (PyTorch)
|
| 59 |
+
- **Base model**: cahya/bert-base-indonesian-1.5G
|
| 60 |
+
- **Max length**: 512 tokens
|
| 61 |
+
- **Batch size**: 16
|
| 62 |
+
- **Epochs**: 3
|
| 63 |
+
- **Learning rate**: 2e-5
|
| 64 |
+
- **Weight decay**: 0.01
|
| 65 |
+
- **Warmup ratio**: 0.1
|
| 66 |
+
- **Scheduler**: Linear
|
| 67 |
+
- **Mixed precision**: Enabled
|
| 68 |
+
- **Optimizer**: AdamW
|
| 69 |
|
| 70 |
## Evaluation
|
| 71 |
|
| 72 |
+
- **Data split**: 80/20 stratified split
|
| 73 |
+
- **Accuracy (validation)**: **92.1%**
|
| 74 |
+
- **F1 Macro (validation)**: **89.3%**
|
| 75 |
+
- **F1 Micro (validation)**: **91.2%**
|
| 76 |
|
| 77 |
+
Model menunjukkan performa yang baik dan seimbang di semua kategori. Detail evaluasi per label tersedia dalam file `eval_results.json`.
|
| 78 |
|
| 79 |
## How to use
|
| 80 |
|
|
|
|
| 82 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 83 |
import torch
|
| 84 |
|
| 85 |
+
# Load model dan tokenizer
|
| 86 |
+
repo_id = "your-username/bert-indonesian-topic-classification"
|
| 87 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
| 88 |
model = AutoModelForSequenceClassification.from_pretrained(repo_id).eval()
|
| 89 |
|
| 90 |
+
# Contoh penggunaan
|
| 91 |
+
text = "Pemerintah Indonesia mengumumkan kebijakan ekonomi baru untuk mendorong pertumbuhan UMKM di tengah situasi global yang menantang."
|
| 92 |
+
|
| 93 |
+
# Tokenize dan prediksi
|
| 94 |
+
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
|
| 95 |
with torch.no_grad():
|
| 96 |
logits = model(**inputs).logits
|
| 97 |
+
|
| 98 |
+
# Dapatkan prediksi
|
| 99 |
pred_id = logits.argmax(-1).item()
|
| 100 |
+
confidence = torch.softmax(logits, dim=-1).max().item()
|
| 101 |
label = model.config.id2label[pred_id]
|
| 102 |
+
|
| 103 |
+
print(f"Predicted topic: {label}")
|
| 104 |
+
print(f"Confidence: {confidence:.4f}")
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
## Additional Information
|
| 108 |
+
|
| 109 |
+
### Label Categories
|
| 110 |
+
|
| 111 |
+
Model ini dapat mengklasifikasikan teks ke dalam 15 kategori berikut:
|
| 112 |
+
|
| 113 |
+
1. **Olahraga** - Berita dan informasi seputar dunia olahraga
|
| 114 |
+
2. **Kecelakaan** - Laporan kecelakaan dan insiden
|
| 115 |
+
3. **Pendidikan** - Topik seputar pendidikan dan pembelajaran
|
| 116 |
+
4. **Politik** - Berita politik, pemerintahan, dan kebijakan
|
| 117 |
+
5. **Judi Online** - Konten terkait perjudian online
|
| 118 |
+
6. **Teknologi** - Perkembangan teknologi dan inovasi
|
| 119 |
+
7. **Kriminalitas** - Berita kriminal dan hukum
|
| 120 |
+
8. **Infrastruktur** - Pembangunan dan infrastruktur
|
| 121 |
+
9. **Kesehatan** - Topik kesehatan dan medis
|
| 122 |
+
10. **Lalu Lintas** - Informasi transportasi dan lalu lintas
|
| 123 |
+
11. **Bencana Alam** - Laporan bencana dan cuaca
|
| 124 |
+
12. **Ekonomi** - Berita ekonomi dan bisnis
|
| 125 |
+
13. **Keuangan** - Topik keuangan dan investasi
|
| 126 |
+
14. **Kemiskinan** - Isu sosial dan kemiskinan
|
| 127 |
+
15. **Pariwisata** - Informasi wisata dan travel
|
| 128 |
+
|
| 129 |
+
### Citation
|
| 130 |
+
|
| 131 |
+
Jika menggunakan model ini dalam penelitian atau proyek, mohon cantumkan referensi:
|
| 132 |
+
|
| 133 |
+
```bibtex
|
| 134 |
+
@misc{bert-indonesian-topic-classification-2025,
|
| 135 |
+
title={BERT Indonesian Topic Classification (15 labels)},
|
| 136 |
+
author={Your Name},
|
| 137 |
+
year={2025},
|
| 138 |
+
url={https://huggingface.co/your-username/bert-indonesian-topic-classification}
|
| 139 |
+
}
|
| 140 |
```
|
generate_readme.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Script untuk generate README.md dari template
|
| 4 |
+
Penggunaan: python generate_readme.py config.yaml
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
import sys
from pathlib import Path

import yaml
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_config(config_path):
    """Parse the YAML configuration file at *config_path* and return its data.

    Returns whatever ``yaml.safe_load`` produces (typically a dict);
    raises ``FileNotFoundError`` if the file does not exist.
    """
    raw_text = Path(config_path).read_text(encoding="utf-8")
    return yaml.safe_load(raw_text)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def load_template(template_path):
    """Read the README template file and return its full contents as a string."""
    template_file = Path(template_path)
    return template_file.read_text(encoding="utf-8")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def replace_placeholders(template, config):
    """Fill every ``{{KEY}}`` placeholder in *template* from *config*.

    List and dict values are serialized as block-style YAML text before
    insertion; every other value is inserted via ``str``. Placeholders with
    no matching config key are left untouched.
    """
    rendered = template
    for name, raw_value in config.items():
        token = f"{{{{{name}}}}}"
        if isinstance(raw_value, (list, dict)):
            # Containers become readable block-style YAML (unicode preserved).
            substitute = yaml.dump(
                raw_value, default_flow_style=False, allow_unicode=True
            ).strip()
        else:
            substitute = str(raw_value)
        rendered = rendered.replace(token, substitute)
    return rendered
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def generate_readme(config_path, template_path, output_path):
    """Render the README template with values from the YAML config and write it.

    Reads the config first, then the template (so a missing config fails
    before the template is touched), substitutes all placeholders, and
    writes the result to *output_path*.
    """
    cfg = load_config(config_path)
    tpl = load_template(template_path)

    rendered = replace_placeholders(tpl, cfg)

    Path(output_path).write_text(rendered, encoding="utf-8")

    print(f"README berhasil digenerate: {output_path}")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def create_sample_config(output_path):
    """Write a sample YAML configuration file to *output_path*.

    The file demonstrates every placeholder key the README template expects;
    users edit it and pass it back via ``--config`` to render a README.
    """
    # NOTE(review): this sample says "16 labels" while the committed README
    # uses 15 — confirm the intended label count.
    sample_config = {
        # Metadata
        "LICENSE": "mit",
        "LANGUAGE": "id",
        "LIBRARY_NAME": "transformers",
        "PIPELINE_TAG": "text-classification",
        "DATASET_TYPE": "custom",
        "INFERENCE_ENABLED": True,
        # Model Info
        "MODEL_NAME": "BERT Indonesian Topic Classification (16 labels)",
        "MODEL_TITLE": "BERT Indonesian Topic Classification (16 labels)",
        "BASE_MODEL": "cahya/bert-base-indonesian-1.5G",
        "TASK_TYPE": "text-classification",
        "TASK_NAME": "Topic Classification",
        "TASK_DESCRIPTION": "Topic classification (single-label)",
        "NUM_LABELS": 16,
        "LABELS_INLINE": "Politik, Ekonomi, Olahraga, Teknologi, dll.",
        "DATASET_NAME": "Custom Dataset (ID)",
        "SPLIT_TYPE": "validation",
        # Visualization
        "VISUALIZATION_TYPE": "Confusion Matrix",
        "VISUALIZATION_FILENAME": "confusion_matrix.png",
        # Tags (as a list)
        # NOTE(review): these strings carry their own YAML indentation —
        # presumably spliced verbatim into the front matter; confirm against
        # the template's placeholder context.
        "TAGS": [
            " - indonesian",
            " - indonesia",
            " - topic-classification",
            " - bert",
        ],
        # Metrics (as a list)
        "METRICS": [
            " - type: accuracy",
            "   value: 0.921",
            " - type: f1",
            "   name: f1_macro",
            "   value: 0.893",
            " - type: f1",
            "   name: f1_micro",
            "   value: 0.912",
        ],
        # Content sections (multiline markdown fragments, inserted verbatim)
        "INTENDED_USE": "- Klasifikasi topik untuk teks berbahasa Indonesia pada domain umum.",
        "LIMITATIONS": """- Performa bergantung pada distribusi label dataset Anda.
- Teks OOD (di luar domain data latih) bisa turun akurasinya.""",
        "TRAINING_DETAILS": """- Framework: 🤗 Transformers (PyTorch)
- Max length: 512
- Batch size: 16
- Epochs: 3
- Learning rate: 2e-5
- Weight decay: 0.01
- Warmup ratio: 0.1
- Scheduler: linear
- Mixed precision: true""",
        "EVALUATION_DETAILS": """- Split: 80/20 stratified
- Accuracy (val): **92.1%**
- F1 Macro (val): **89.3%**
- F1 Micro (val): **91.2%**

Per-label report tersedia pada artifact `eval_results.json`.""",
        "USAGE_CODE": """from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

repo_id = "your-username/model-name"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(repo_id).eval()

text = "Contoh teks untuk diklasifikasi."
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
pred_id = logits.argmax(-1).item()
label = model.config.id2label[pred_id]
print(label)""",
        "ADDITIONAL_INFO": """## Citation

Jika menggunakan model ini, mohon kutip:
```bibtex
@misc{your-model-2025,
  title={Model Title},
  author={Your Name},
  year={2025},
  url={https://huggingface.co/your-username/model-name}
}
```""",
    }

    # Dump with unicode intact so the Indonesian text and emoji survive.
    with open(output_path, "w", encoding="utf-8") as f:
        yaml.dump(
            sample_config, f, default_flow_style=False, allow_unicode=True, indent=2
        )

    print(f"Sample config dibuat: {output_path}")
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def main():
    """CLI entry point: generate a README from a template, or emit a sample config.

    Exits with status 1 on missing/invalid inputs (previously these paths
    returned normally, so the process exited 0 on failure and shell/CI
    callers could not detect the error). Error messages now go to stderr.
    """
    parser = argparse.ArgumentParser(description="Generate README dari template")
    parser.add_argument("--config", "-c", help="Path ke file config YAML")
    parser.add_argument(
        "--template",
        "-t",
        default="README.md",
        help="Path ke template README (default: README.md)",
    )
    parser.add_argument(
        "--output",
        "-o",
        default="README_generated.md",
        help="Path output README (default: README_generated.md)",
    )
    parser.add_argument(
        "--create-sample", action="store_true", help="Buat sample config file"
    )

    args = parser.parse_args()

    # --create-sample is a standalone mode; no other arguments are required.
    if args.create_sample:
        create_sample_config("sample_config.yaml")
        return

    if not args.config:
        print(
            "Error: --config diperlukan kecuali menggunakan --create-sample",
            file=sys.stderr,
        )
        parser.print_help()
        sys.exit(1)

    if not Path(args.config).exists():
        print(f"Error: Config file tidak ditemukan: {args.config}", file=sys.stderr)
        sys.exit(1)

    if not Path(args.template).exists():
        print(f"Error: Template file tidak ditemukan: {args.template}", file=sys.stderr)
        sys.exit(1)

    generate_readme(args.config, args.template, args.output)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|