Upload folder using huggingface_hub
- .gitattributes +5 -0
- __pycache__/translator.cpython-311.pyc +0 -0
- dataset/dev/dev.en +0 -0
- dataset/dev/dev.vi +0 -0
- dataset/test/test.en +0 -0
- dataset/test/test.vi +0 -0
- dataset/train/train.en +3 -0
- dataset/train/train.vi +3 -0
- dataset_convert/dev/dev.en +0 -0
- dataset_convert/dev/dev.vi +0 -0
- dataset_convert/test/test.en +0 -0
- dataset_convert/test/test.vi +0 -0
- dataset_convert/train/train.en +3 -0
- dataset_convert/train/train.vi +3 -0
- en-vi-translation-transformer-tensorflow.log +5 -0
- main.py +104 -0
- models/__pycache__/decoder.cpython-311.pyc +0 -0
- models/__pycache__/encoder.cpython-311.pyc +0 -0
- models/__pycache__/layers.cpython-311.pyc +0 -0
- models/__pycache__/transformer.cpython-311.pyc +0 -0
- models/__pycache__/utils.cpython-311.pyc +0 -0
- models/decoder.py +53 -0
- models/encoder.py +51 -0
- models/layers.py +123 -0
- models/transformer.py +58 -0
- models/utils.py +23 -0
- requirements.txt +6 -0
- saved_models/backup_weights/latest.weights.h5 +3 -0
- saved_models/backup_weights/training_metadata.json +1 -0
- saved_models/en_vi_translation.keras +3 -0
- tokenizers/en_tokenizer.pkl +3 -0
- tokenizers/vi_tokenizer.pkl +3 -0
- translator.py +44 -0
- utils/__pycache__/preprocessing.cpython-311.pyc +0 -0
- utils/__pycache__/tokenizer_utils.cpython-311.pyc +0 -0
- utils/preprocessing.py +19 -0
- utils/tokenizer_utils.py +14 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+dataset/train/train.en filter=lfs diff=lfs merge=lfs -text
+dataset/train/train.vi filter=lfs diff=lfs merge=lfs -text
+dataset_convert/train/train.en filter=lfs diff=lfs merge=lfs -text
+dataset_convert/train/train.vi filter=lfs diff=lfs merge=lfs -text
+saved_models/en_vi_translation.keras filter=lfs diff=lfs merge=lfs -text
__pycache__/translator.cpython-311.pyc
ADDED
Binary file (3.39 kB)

dataset/dev/dev.en
ADDED
The diff for this file is too large to render.

dataset/dev/dev.vi
ADDED
The diff for this file is too large to render.

dataset/test/test.en
ADDED
The diff for this file is too large to render.

dataset/test/test.vi
ADDED
The diff for this file is too large to render.
dataset/train/train.en
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c26dfeed74b6bf3752f5ca552f2412456f0de153f7c804df8717931fb3a5c78a
+size 13603614

dataset/train/train.vi
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:707206edf2dc0280273952c7b70544ea8a1363aa69aaeb9d70514b888dc3067d
+size 18074646
dataset_convert/dev/dev.en
ADDED
The diff for this file is too large to render.

dataset_convert/dev/dev.vi
ADDED
The diff for this file is too large to render.

dataset_convert/test/test.en
ADDED
The diff for this file is too large to render.

dataset_convert/test/test.vi
ADDED
The diff for this file is too large to render.
dataset_convert/train/train.en
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c26dfeed74b6bf3752f5ca552f2412456f0de153f7c804df8717931fb3a5c78a
+size 13603614

dataset_convert/train/train.vi
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28fe7bf65585138135caa5d35b3426fcd37748a8c392608132decf36ae275d89
+size 19722027
en-vi-translation-transformer-tensorflow.log
ADDED
@@ -0,0 +1,5 @@
+[{"stream_name":"stderr","time":5.45447475,"data":"/usr/local/lib/python3.10/dist-packages/traitlets/traitlets.py:2915: FutureWarning: --Exporter.preprocessors=[\"nbconvert.preprocessors.ExtractOutputPreprocessor\"] for containers is deprecated in traitlets 5.0. You can pass `--Exporter.preprocessors item` ... multiple times to add items to a list.\n"}
+,{"stream_name":"stderr","time":5.454559784,"data":" warn(\n"}
+,{"stream_name":"stderr","time":5.528515974,"data":"[NbConvertApp] Converting notebook __notebook__.ipynb to html\n"}
+,{"stream_name":"stderr","time":8.239659837,"data":"[NbConvertApp] Writing 448192 bytes to __results__.html\n"}
+]
main.py
ADDED
@@ -0,0 +1,104 @@
+import tensorflow as tf
+from translator import Translator
+from utils import tokenizer_utils
+from utils.preprocessing import input_processing, output_processing
+from models.transformer import Transformer
+from models.encoder import Encoder
+from models.decoder import Decoder
+from models.layers import EncoderLayer, DecoderLayer, MultiHeadAttention, point_wise_feed_forward_network
+from models.utils import masked_loss, masked_accuracy
+import argparse
+
+def main(sentences: list, model: tf.keras.Model, en_tokenizer, vi_tokenizer) -> None:
+    """
+    Translates input English sentences to Vietnamese using a pre-trained model.
+
+    Args:
+        sentences (list): List of English sentences to translate.
+        model (tf.keras.Model): The pre-trained translation model.
+        en_tokenizer: English tokenizer.
+        vi_tokenizer: Vietnamese tokenizer.
+    """
+    # Initialize the translator with tokenizers and the model
+    translator = Translator(en_tokenizer, vi_tokenizer, model)
+
+    # Process and translate each sentence
+    for sentence in sentences:
+        processed_sentence = input_processing(sentence)
+        translated_text = translator(processed_sentence)
+        translated_text = output_processing(translated_text)
+
+        # Display the input and translated text
+        print("Input:", processed_sentence)
+        print("Translated:", translated_text)
+        print("-" * 50)
+
+if __name__ == "__main__":
+    # Set up argument parser
+    parser = argparse.ArgumentParser(
+        description="Translate English sentences to Vietnamese using a pre-trained transformer model.",
+        epilog="Example: python translate.py --sentence 'Hello, world!' --sentence 'The sun is shining.'"
+    )
+    parser.add_argument(
+        "--sentence",
+        type=str,
+        nargs="*",
+        default=[
+            (
+                "For at least six centuries, residents along a lake in the mountains of central Japan "
+                "have marked the depth of winter by celebrating the return of a natural phenomenon "
+                "once revered as the trail of a wandering god."
+            )
+        ],
+        help="One or more English sentences to translate (default: provided example sentence)"
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="saved_models/en_vi_translation.keras",
+        help="Path to the pre-trained model file (default: saved_models/en_vi_translation.keras)"
+    )
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    # Define custom objects required for loading the model
+    custom_objects = {
+        "Transformer": Transformer,
+        "Encoder": Encoder,
+        "Decoder": Decoder,
+        "EncoderLayer": EncoderLayer,
+        "DecoderLayer": DecoderLayer,
+        "MultiHeadAttention": MultiHeadAttention,
+        "point_wise_feed_forward_network": point_wise_feed_forward_network,
+        "masked_loss": masked_loss,
+        "masked_accuracy": masked_accuracy,
+    }
+
+    # Load the pre-trained model once
+    print("Loading model from:", args.model_path)
+    loaded_model = tf.keras.models.load_model(
+        args.model_path, custom_objects=custom_objects
+    )
+    print("Model loaded successfully.")
+
+    # Load English and Vietnamese tokenizers once
+    en_tokenizer, vi_tokenizer = tokenizer_utils.load_tokenizers()
+
+    # Run the translation for all provided sentences
+    main(sentences=args.sentence, model=loaded_model, en_tokenizer=en_tokenizer, vi_tokenizer=vi_tokenizer)
+
+    # Interactive loop for additional translations
+    while True:
+        choice = input("Would you like to translate another sentence? (Y/n): ").strip().lower()
+        if choice in ['no', 'n', 'quit', 'q']:
+            print("Exiting the program.")
+            break
+        elif choice in ['yes', 'y']:
+            new_sentence = input("Enter an English sentence to translate: ").strip()
+            if new_sentence:
+                main(sentences=[new_sentence], model=loaded_model, en_tokenizer=en_tokenizer, vi_tokenizer=vi_tokenizer)
+            else:
+                print("No sentence provided. Please try again.")
+        else:
+            print("Invalid input. Please enter 'y' or 'n'.")
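For reference, the __main__ block above reduces to the following minimal sketch (illustrative only, not part of the committed files), assuming the repository root is the working directory and the Git LFS artifacts (saved model and tokenizer pickles) have been pulled:

# Illustrative programmatic equivalent of the __main__ block (not part of the commit).
# Assumes the repository root is the working directory and the LFS files are pulled.
import tensorflow as tf
from translator import Translator
from utils import tokenizer_utils
from utils.preprocessing import input_processing, output_processing
from models.transformer import Transformer
from models.encoder import Encoder
from models.decoder import Decoder
from models.layers import EncoderLayer, DecoderLayer, MultiHeadAttention, point_wise_feed_forward_network
from models.utils import masked_loss, masked_accuracy

custom_objects = {
    "Transformer": Transformer, "Encoder": Encoder, "Decoder": Decoder,
    "EncoderLayer": EncoderLayer, "DecoderLayer": DecoderLayer,
    "MultiHeadAttention": MultiHeadAttention,
    "point_wise_feed_forward_network": point_wise_feed_forward_network,
    "masked_loss": masked_loss, "masked_accuracy": masked_accuracy,
}
model = tf.keras.models.load_model("saved_models/en_vi_translation.keras",
                                   custom_objects=custom_objects)
en_tokenizer, vi_tokenizer = tokenizer_utils.load_tokenizers()
translator = Translator(en_tokenizer, vi_tokenizer, model)

sentence = input_processing("The sun is shining.")
print("Translated:", output_processing(translator(sentence)))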
models/__pycache__/decoder.cpython-311.pyc
ADDED
Binary file (4.88 kB)

models/__pycache__/encoder.cpython-311.pyc
ADDED
Binary file (4.82 kB)

models/__pycache__/layers.cpython-311.pyc
ADDED
Binary file (9.45 kB)

models/__pycache__/transformer.cpython-311.pyc
ADDED
Binary file (4 kB)

models/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (1.76 kB)
models/decoder.py
ADDED
@@ -0,0 +1,53 @@
+import tensorflow as tf
+from .layers import DecoderLayer
+
+@tf.keras.utils.register_keras_serializable()
+class Decoder(tf.keras.layers.Layer):
+    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
+                 max_tokens, dropout_rate, **kwargs):
+        super(Decoder, self).__init__(**kwargs)
+        self.d_model = d_model
+        self.num_layers = num_layers
+        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
+        self.pos_encoding = self.positional_encoding(max_tokens, d_model)
+        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, dropout_rate)
+                           for _ in range(num_layers)]
+        self.dropout = tf.keras.layers.Dropout(dropout_rate)
+
+    def call(self, x, enc_output, training=None, look_ahead_mask=None, padding_mask=None):
+        seq_len = tf.shape(x)[1]
+        x = self.embedding(x)
+        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+        x += self.pos_encoding[:, :seq_len, :]
+        x = self.dropout(x, training=training)
+        for i in range(self.num_layers):
+            x = self.dec_layers[i](x, enc_output, training=training,
+                                   look_ahead_mask=look_ahead_mask,
+                                   padding_mask=padding_mask)
+        return x
+
+    def positional_encoding(self, max_len, d_model):
+        angle_rads = self.get_angles(tf.range(max_len, dtype=tf.float32)[:, tf.newaxis],
+                                     tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
+                                     d_model)
+        sines = tf.math.sin(angle_rads[:, 0::2])
+        cosines = tf.math.cos(angle_rads[:, 1::2])
+        pos_encoding = tf.concat([sines, cosines], axis=-1)
+        return pos_encoding[tf.newaxis, ...]
+
+    def get_angles(self, pos, i, d_model):
+        angle_rates = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
+        return pos * angle_rates
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            'num_layers': self.num_layers,
+            'd_model': self.d_model,
+            'num_heads': self.num_heads,
+            'dff': self.dff,
+            'target_vocab_size': self.embedding.input_dim,
+            'max_tokens': self.pos_encoding.shape[1],
+            'dropout_rate': self.dropout.rate
+        })
+        return config
models/encoder.py
ADDED
@@ -0,0 +1,51 @@
+import tensorflow as tf
+from .layers import EncoderLayer
+
+@tf.keras.utils.register_keras_serializable()
+class Encoder(tf.keras.layers.Layer):
+    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
+                 max_tokens, dropout_rate, **kwargs):
+        super(Encoder, self).__init__(**kwargs)
+        self.d_model = d_model
+        self.num_layers = num_layers
+        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
+        self.pos_encoding = self.positional_encoding(max_tokens, d_model)
+        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, dropout_rate)
+                           for _ in range(num_layers)]
+        self.dropout = tf.keras.layers.Dropout(dropout_rate)
+
+    def call(self, x, training=None, mask=None):
+        seq_len = tf.shape(x)[1]
+        x = self.embedding(x)
+        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
+        x += self.pos_encoding[:, :seq_len, :]
+        x = self.dropout(x, training=training)
+        for i in range(self.num_layers):
+            x = self.enc_layers[i](x, training=training, mask=mask)
+        return x
+
+    def positional_encoding(self, max_len, d_model):
+        angle_rads = self.get_angles(tf.range(max_len, dtype=tf.float32)[:, tf.newaxis],
+                                     tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
+                                     d_model)
+        sines = tf.math.sin(angle_rads[:, 0::2])
+        cosines = tf.math.cos(angle_rads[:, 1::2])
+        pos_encoding = tf.concat([sines, cosines], axis=-1)
+        return pos_encoding[tf.newaxis, ...]
+
+    def get_angles(self, pos, i, d_model):
+        angle_rates = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
+        return pos * angle_rates
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            'num_layers': self.num_layers,
+            'd_model': self.d_model,
+            'num_heads': self.num_heads,
+            'dff': self.dff,
+            'input_vocab_size': self.embedding.input_dim,
+            'max_tokens': self.pos_encoding.shape[1],
+            'dropout_rate': self.dropout.rate
+        })
+        return config
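As a quick shape check of the encoder above (illustrative only, not part of the committed files), the sinusoidal positional encoding comes back as a (1, max_tokens, d_model) tensor that is sliced to the current sequence length and added to the scaled embeddings:

# Illustrative shape check for the Encoder (not part of the commit).
import tensorflow as tf
from models.encoder import Encoder

enc = Encoder(num_layers=2, d_model=128, num_heads=8, dff=512,
              input_vocab_size=1000, max_tokens=64, dropout_rate=0.1)
print(enc.pos_encoding.shape)   # (1, 64, 128): one sinusoidal vector per position

tokens = tf.random.uniform((2, 10), maxval=1000, dtype=tf.int32)  # (batch, seq_len) ids
print(enc(tokens, training=False).shape)                          # (2, 10, 128)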
models/layers.py
ADDED
@@ -0,0 +1,123 @@
+import tensorflow as tf
+from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization
+
+@tf.keras.utils.register_keras_serializable()
+class EncoderLayer(Layer):
+    def __init__(self, d_model, num_heads, dff, dropout_rate, **kwargs):
+        super(EncoderLayer, self).__init__(**kwargs)
+        self.mha = MultiHeadAttention(d_model, num_heads)
+        self.ffn = point_wise_feed_forward_network(d_model, dff)
+        self.layernorm1 = LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = LayerNormalization(epsilon=1e-6)
+        self.dropout1 = Dropout(dropout_rate)
+        self.dropout2 = Dropout(dropout_rate)
+
+    def call(self, x, training=None, mask=None):
+        attn_output, _ = self.mha(x, x, x, mask)
+        attn_output = self.dropout1(attn_output, training=training)
+        out1 = self.layernorm1(x + attn_output)
+        ffn_output = self.ffn(out1)
+        ffn_output = self.dropout2(ffn_output, training=training)
+        out2 = self.layernorm2(out1 + ffn_output)
+        return out2
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            'd_model': self.mha.d_model,
+            'num_heads': self.mha.num_heads,
+            'dff': self.ffn.layers[0].units,
+            'dropout_rate': self.dropout1.rate
+        })
+        return config
+
+@tf.keras.utils.register_keras_serializable()
+class DecoderLayer(Layer):
+    def __init__(self, d_model, num_heads, dff, dropout_rate, **kwargs):
+        super(DecoderLayer, self).__init__(**kwargs)
+        self.mha1 = MultiHeadAttention(d_model, num_heads)
+        self.mha2 = MultiHeadAttention(d_model, num_heads)
+        self.ffn = point_wise_feed_forward_network(d_model, dff)
+        self.layernorm1 = LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = LayerNormalization(epsilon=1e-6)
+        self.layernorm3 = LayerNormalization(epsilon=1e-6)
+        self.dropout1 = Dropout(dropout_rate)
+        self.dropout2 = Dropout(dropout_rate)
+        self.dropout3 = Dropout(dropout_rate)
+
+    def call(self, x, enc_output, training=None, look_ahead_mask=None, padding_mask=None):
+        attn1, _ = self.mha1(x, x, x, look_ahead_mask)
+        attn1 = self.dropout1(attn1, training=training)
+        out1 = self.layernorm1(x + attn1)
+        attn2, _ = self.mha2(enc_output, enc_output, out1, padding_mask)
+        attn2 = self.dropout2(attn2, training=training)
+        out2 = self.layernorm2(out1 + attn2)
+        ffn_output = self.ffn(out2)
+        ffn_output = self.dropout3(ffn_output, training=training)
+        out3 = self.layernorm3(out2 + ffn_output)
+        return out3
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            'd_model': self.mha1.d_model,
+            'num_heads': self.mha1.num_heads,
+            'dff': self.ffn.layers[0].units,
+            'dropout_rate': self.dropout1.rate
+        })
+        return config
+
+@tf.keras.utils.register_keras_serializable()
+class MultiHeadAttention(Layer):
+    def __init__(self, d_model, num_heads, **kwargs):
+        super(MultiHeadAttention, self).__init__(**kwargs)
+        self.num_heads = num_heads
+        self.d_model = d_model
+        assert d_model % num_heads == 0
+        self.depth = d_model // num_heads
+        self.wq = Dense(d_model)
+        self.wk = Dense(d_model)
+        self.wv = Dense(d_model)
+        self.dense = Dense(d_model)
+
+    def split_heads(self, x, batch_size):
+        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
+        return tf.transpose(x, perm=[0, 2, 1, 3])
+
+    def call(self, v, k, q, mask=None):
+        batch_size = tf.shape(q)[0]
+        q = self.wq(q)
+        k = self.wk(k)
+        v = self.wv(v)
+        q = self.split_heads(q, batch_size)
+        k = self.split_heads(k, batch_size)
+        v = self.split_heads(v, batch_size)
+        scaled_attention, _ = self.scaled_dot_product_attention(q, k, v, mask)
+        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
+        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
+        output = self.dense(concat_attention)
+        return output, _
+
+    def scaled_dot_product_attention(self, q, k, v, mask):
+        matmul_qk = tf.matmul(q, k, transpose_b=True)
+        dk = tf.cast(tf.shape(k)[-1], tf.float32)
+        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+        if mask is not None:
+            scaled_attention_logits += (mask * -1e9)
+        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
+        output = tf.matmul(attention_weights, v)
+        return output, attention_weights
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            'd_model': self.d_model,
+            'num_heads': self.num_heads
+        })
+        return config
+
+def point_wise_feed_forward_network(d_model, dff):
+    return tf.keras.Sequential([
+        Dense(dff, activation='relu'),
+        Dense(d_model)
+    ])
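The attention block above splits d_model into num_heads heads of size d_model // num_heads and pushes masked positions toward -1e9 before the softmax. A small shape check (illustrative only, not part of the committed files):

# Illustrative shape check for MultiHeadAttention (not part of the commit).
import tensorflow as tf
from models.layers import MultiHeadAttention

mha = MultiHeadAttention(d_model=128, num_heads=8)  # head size: 128 // 8 = 16
x = tf.random.normal((2, 10, 128))                  # (batch, seq_len, d_model)
mask = tf.zeros((2, 1, 1, 10))                      # 1.0 would mark padded key positions
out, weights = mha(x, x, x, mask)
print(out.shape)      # (2, 10, 128)
print(weights.shape)  # (2, 8, 10, 10): (batch, heads, query_len, key_len)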
models/transformer.py
ADDED
@@ -0,0 +1,58 @@
+import tensorflow as tf
+from .encoder import Encoder
+from .decoder import Decoder
+from tensorflow.keras.layers import Dense
+
+@tf.keras.utils.register_keras_serializable()
+class Transformer(tf.keras.Model):
+    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
+                 target_vocab_size, max_tokens, dropout_rate=0.1, **kwargs):
+        super(Transformer, self).__init__(**kwargs)
+        self.num_layers = num_layers
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.dff = dff
+        self.input_vocab_size = input_vocab_size
+        self.target_vocab_size = target_vocab_size
+        self.max_tokens = max_tokens
+        self.dropout_rate = dropout_rate
+
+        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
+                               input_vocab_size, max_tokens, dropout_rate)
+        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
+                               target_vocab_size, max_tokens, dropout_rate)
+        self.final_layer = Dense(target_vocab_size)
+
+    def call(self, inputs, training=None):
+        enc_input, dec_input = inputs
+        enc_padding_mask = self.create_padding_mask(enc_input)
+        look_ahead_mask = self.create_look_ahead_mask(tf.shape(dec_input)[1])
+        dec_padding_mask = self.create_padding_mask(enc_input)
+        enc_output = self.encoder(enc_input, training=training, mask=enc_padding_mask)
+        dec_output = self.decoder(dec_input, enc_output, training=training,
+                                  look_ahead_mask=look_ahead_mask,
+                                  padding_mask=dec_padding_mask)
+        final_output = self.final_layer(dec_output)
+        return final_output
+
+    def create_padding_mask(self, seq):
+        mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
+        return mask[:, tf.newaxis, tf.newaxis, :]
+
+    def create_look_ahead_mask(self, size):
+        mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+        return mask
+
+    def get_config(self):
+        config = super().get_config()
+        config.update({
+            'num_layers': self.num_layers,
+            'd_model': self.d_model,
+            'num_heads': self.num_heads,
+            'dff': self.dff,
+            'input_vocab_size': self.input_vocab_size,
+            'target_vocab_size': self.target_vocab_size,
+            'max_tokens': self.max_tokens,
+            'dropout_rate': self.dropout_rate
+        })
+        return config
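Putting the pieces together (illustrative only, not part of the committed files), a toy Transformer maps a batch of source and shifted-target token ids to one logit vector over the target vocabulary per decoder position:

# Illustrative end-to-end shape check for the Transformer (not part of the commit).
import tensorflow as tf
from models.transformer import Transformer

toy = Transformer(num_layers=2, d_model=64, num_heads=4, dff=256,
                  input_vocab_size=1000, target_vocab_size=1200,
                  max_tokens=40, dropout_rate=0.1)
enc_in = tf.random.uniform((2, 12), maxval=1000, dtype=tf.int32)  # source token ids
dec_in = tf.random.uniform((2, 9), maxval=1200, dtype=tf.int32)   # shifted target ids
logits = toy([enc_in, dec_in], training=False)
print(logits.shape)  # (2, 9, 1200): logits over the target vocabulary per position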
models/utils.py
ADDED
@@ -0,0 +1,23 @@
+import tensorflow as tf
+
+@tf.keras.utils.register_keras_serializable()
+def masked_loss(label, pred):
+    mask = label != 0
+    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
+        from_logits=True, reduction='none')
+    loss = loss_object(label, pred)
+    mask = tf.cast(mask, dtype=loss.dtype)
+    loss *= mask
+    loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
+    return loss
+
+@tf.keras.utils.register_keras_serializable()
+def masked_accuracy(label, pred):
+    pred = tf.argmax(pred, axis=2)
+    label = tf.cast(label, pred.dtype)
+    match = label == pred
+    mask = label != 0
+    match = match & mask
+    match = tf.cast(match, dtype=tf.float32)
+    mask = tf.cast(mask, dtype=tf.float32)
+    return tf.reduce_sum(match)/tf.reduce_sum(mask)
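Both functions above ignore padding: positions where the label is 0 contribute nothing, and the sums are normalised by the number of real tokens only. A toy check (illustrative only, not part of the committed files):

# Illustrative check of masked_loss / masked_accuracy (not part of the commit).
import tensorflow as tf
from models.utils import masked_loss, masked_accuracy

labels = tf.constant([[5, 7, 0, 0]])       # two real tokens followed by two pad tokens
logits = tf.random.normal((1, 4, 10))      # (batch, seq_len, vocab_size) raw logits
print(float(masked_loss(labels, logits)))      # averaged over the 2 non-pad positions
print(float(masked_accuracy(labels, logits)))  # fraction of the 2 non-pad tokens matched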
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+pandas==2.2.3
+matplotlib==3.7.5
+tensorflow_datasets==4.9.7
+tensorflow==2.17.1
+tensorflow-text==2.17.0
+underthesea==6.8.4
saved_models/backup_weights/latest.weights.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4e1e9122a5896689ccb4cb169ff3b2edf9e5474655468fd638b68cb31159eb4
+size 368283712

saved_models/backup_weights/training_metadata.json
ADDED
@@ -0,0 +1 @@
+{"epoch": 50, "batch": 0}

saved_models/en_vi_translation.keras
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85b96fdac99f5d2b1fb6f6823cb6c5691fd2ff339d8a966f0b0cc6da52cc33fd
+size 368250218

tokenizers/en_tokenizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d172551a4da648a35629de06593b3051ce32598c3166464de46fc4a1f6f90981
+size 3973568

tokenizers/vi_tokenizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42a96a2d1b932a52603083dff973a0eba79998c5e2c2717e54f5c6eeaf84b8fa
+size 5835250
translator.py
ADDED
@@ -0,0 +1,44 @@
+import tensorflow as tf
+import numpy as np
+from models.transformer import Transformer
+from models.encoder import Encoder
+from models.decoder import Decoder
+from models.layers import EncoderLayer, DecoderLayer, MultiHeadAttention, point_wise_feed_forward_network
+from models.utils import masked_loss, masked_accuracy
+
+class Translator(tf.Module):
+    def __init__(self, en_tokenizer, vi_tokenizer, transformer, max_tokens=512):
+        self.tokenizers_en = en_tokenizer
+        self.tokenizers_vi = vi_tokenizer
+        self.transformer = transformer
+        self.max_tokens = max_tokens
+
+    def __call__(self, sentence, max_length=None):
+        if max_length is None:
+            max_length = self.max_tokens
+
+        sentence = self.tokenizers_en.texts_to_sequences([sentence])
+        sentence = sentence[0] + np.zeros(self.max_tokens - len(sentence[0]),
+                                          dtype=np.int32).tolist()
+        sentence = tf.convert_to_tensor([sentence])
+        encoder_input = sentence
+
+        start = self.tokenizers_vi.texts_to_sequences(["<sos>"])[0]
+        end = self.tokenizers_vi.texts_to_sequences(["<eos>"])[0]
+
+        output_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
+        output_array = output_array.write(0, start)
+
+        for i in tf.range(max_length):
+            output = tf.transpose(output_array.stack())
+            predictions = self.transformer([encoder_input, output], training=False)
+            predictions = predictions[:, -1:, :]
+            predicted_id = tf.argmax(predictions, axis=-1, output_type=tf.int32)
+            output_array = output_array.write(i+1, predicted_id[0])
+
+            if predicted_id == end:
+                break
+
+        output = tf.transpose(output_array.stack())
+        text = self.tokenizers_vi.sequences_to_texts(output.numpy().tolist())[0]
+        return text
utils/__pycache__/preprocessing.cpython-311.pyc
ADDED
Binary file (1.17 kB)

utils/__pycache__/tokenizer_utils.cpython-311.pyc
ADDED
Binary file (1.28 kB)
utils/preprocessing.py
ADDED
@@ -0,0 +1,19 @@
+import string
+
+def input_processing(in_string):
+
+    punct_marks = string.punctuation
+    for mark in punct_marks:
+        if mark in in_string:
+            in_string = in_string.replace(mark, " " + mark)
+    in_string = in_string.replace("scholl", "school")
+    in_string = in_string.strip()
+
+    return in_string
+
+def output_processing(in_string):
+
+    in_string = in_string.replace("_", " ").replace('<sos>', '').replace('<eos>', '').strip()
+    in_string = in_string[0].upper() + in_string[1:]
+
+    return in_string
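For example (illustrative only, not part of the committed files), input_processing pads punctuation with a leading space, presumably so the tokens match the training vocabulary, while output_processing undoes the underscore word segmentation and strips the <sos>/<eos> markers:

# Illustrative behaviour of the two helpers (not part of the commit).
from utils.preprocessing import input_processing, output_processing

print(input_processing("Hello, world!"))
# -> Hello , world !

print(output_processing("<sos> xin_chào thế_giới <eos>"))
# -> Xin chào thế giới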
utils/tokenizer_utils.py
ADDED
@@ -0,0 +1,14 @@
+import pickle
+import tensorflow as tf
+
+def load_tokenizers(en_path='tokenizers/en_tokenizer.pkl',
+                    vi_path='tokenizers/vi_tokenizer.pkl'):
+    with open(en_path, 'rb') as f:
+        en_tokenizer = pickle.load(f)
+    with open(vi_path, 'rb') as f:
+        vi_tokenizer = pickle.load(f)
+
+    en_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(en_tokenizer)
+    vi_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(vi_tokenizer)
+
+    return en_tokenizer, vi_tokenizer
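The helper above round-trips a pickled JSON string back into a Keras Tokenizer. A usage sketch (illustrative only, not part of the committed files), assuming the tokenizer pickles have been fetched from Git LFS:

# Illustrative usage of load_tokenizers (not part of the commit).
from utils.tokenizer_utils import load_tokenizers

en_tokenizer, vi_tokenizer = load_tokenizers()
ids = en_tokenizer.texts_to_sequences(["the sun is shining"])
print(ids)  # one list of word ids; the actual ids depend on the fitted vocabulary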