Ziad Meligy committed
Commit 36a5d2b · 1 Parent(s): eb8805a

Pushing deployment to space

Files changed (5)
  1. requirements.txt +13 -0
  2. tags.py +1 -0
  3. tokenizer_wrapper.py +110 -0
  4. utility.py +64 -0
  5. utils.py +39 -0
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ transformers
+ torch
+ pandas
+ numpy
+ matplotlib
+ tiktoken
+ tensorflow
+ gensim
+ torchvision
+ scikit-image
+ pydicom
+ fastapi
+ python-multipart
tags.py ADDED
@@ -0,0 +1 @@
+ tags = ['normal', 'right', 'lung', 'calcified granuloma', 'upper lobe', 'lingula', 'opacity', 'pulmonary atelectasis', 'interstitial', 'bilateral', 'diffuse', 'markings', 'prominent', 'left', 'density', 'retrocardiac', 'metabolic', 'spine', 'calcinosis', 'base', 'bone diseases', 'tortuous', 'indwelling', 'degenerative', 'aorta', 'catheters', 'thoracic vertebrae', 'mild', 'cardiomegaly', 'severe', 'diaphragm', 'elevated', 'hypoinflation', 'pulmonary congestion', 'technical quality of image unsatisfactory', 'chronic', 'pleural effusion', 'consolidation', 'costophrenic angle', 'airspace disease', 'blunted', 'surgical instruments', 'implanted medical device', 'patchy', 'streaky', 'pleura', 'thickening', 'focal', 'cicatrix', 'hilum', 'lower lobe', 'round', 'small', 'hyperdistention', 'mediastinum', 'nodule', 'no indexing', 'posterior', 'obscured', 'scoliosis', 'bronchovascular', 'granulomatous disease', 'multiple', 'osteophyte', 'middle lobe', 'hernia', 'hiatal', 'thoracic', 'pulmonary emphysema', 'lymph nodes', 'atherosclerosis', 'deformity', 'anterior', 'ribs', 'lucency', 'scattered', 'lumbar vertebrae', 'flattened', 'spondylosis', 'bone', 'borderline', 'fractures', 'thorax', 'healed', 'kyphosis', 'chronic obstructive', 'emphysema', 'pulmonary disease', 'infiltrate', 'pulmonary edema', 'moderate', 'enlarged', 'cardiac shadow', 'foreign bodies', 'spinal fusion', 'apex', 'diaphragmatic eventration', 'arthritis', 'pneumonia', 'abdomen', 'large', 'tube', 'inserted', 'paratracheal', 'granuloma']
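
tags.py only defines the label vocabulary, so a short sketch of how such a list is typically consumed may help. The predict_tags helper, the logits input, and the 0.5 threshold below are hypothetical illustrations, not part of this commit:

    import torch
    from tags import tags

    def predict_tags(logits: torch.Tensor, threshold: float = 0.5):
        # hypothetical helper: map multi-label sigmoid outputs back to tag names
        probs = torch.sigmoid(logits)  # one probability per tag
        keep = (probs > threshold).nonzero(as_tuple=True)[0]
        return [tags[i] for i in keep.tolist()]

    # e.g. predict_tags(model_output) might return ['opacity', 'left']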
tokenizer_wrapper.py ADDED
@@ -0,0 +1,110 @@
+ import pandas as pd
+ from tensorflow.keras.preprocessing.text import Tokenizer  # type: ignore
+ from tensorflow.keras.preprocessing.sequence import pad_sequences  # type: ignore
+ from tensorflow.keras.preprocessing.text import text_to_word_sequence  # type: ignore
+ import numpy as np
+ from transformers import GPT2Tokenizer
+
+
+ class TokenizerWrapper:
+     def __init__(self, class_name, max_caption_length, tokenizer_num_words=None):
+         # dataset_df = pd.read_csv(dataset_csv_file)
+         # sentences = dataset_df[class_name].tolist()
+         self.max_caption_length = max_caption_length
+         self.tokenizer_num_words = tokenizer_num_words
+         # self.init_tokenizer(sentences)
+         self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_prefix_space=True)
+         # '<' serves as the padding token; it is stripped again in filter_special_words
+         self.gpt2_tokenizer.pad_token = "<"
+
+     def clean_sentence(self, sentence):
+         return text_to_word_sequence(sentence, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
+
+     def GPT2_pad_token_id(self):
+         return self.gpt2_tokenizer.pad_token_id
+
+     def GPT2_eos_token_id(self):
+         return self.gpt2_tokenizer.eos_token_id
+
+     def GPT2_encode(self, sentences, pad=True, max_length=None):
+         if max_length is None:
+             max_length = self.max_caption_length
+         # note: pad_to_max_length is deprecated in recent transformers; padding='max_length' is the modern equivalent
+         if isinstance(sentences, str):
+             return self.gpt2_tokenizer.encode(sentences, add_special_tokens=True, max_length=max_length,
+                                               pad_to_max_length=pad)
+         tokens = np.zeros((sentences.shape[0], max_length), dtype=int)
+
+         for i in range(len(sentences)):
+             if pd.isna(sentences[i]):
+                 sentences[i][0] = ""
+             sentence = sentences[i][0].lower()
+             sentence = sentence.replace('"', '')
+             sentence = sentence.replace('xxxx', '')
+             sentence = sentence.replace('endseq', '<|endoftext|>')
+             tokens[i] = self.gpt2_tokenizer.encode(sentence, add_special_tokens=True,
+                                                    max_length=max_length, pad_to_max_length=pad)
+         return tokens
+
+     def GPT2_decode(self, tokens):
+         return self.gpt2_tokenizer.decode(tokens, skip_special_tokens=True)
+
+     def GPT2_format_output(self, sentence):
+         sentence = self.clean_sentence(sentence)
+         return sentence
+
+     def filter_special_words(self, sentence):
+         sentence = sentence.replace('startseq', '')
+         sentence = sentence.replace('endseq', '')
+         sentence = sentence.replace('<|endoftext|>', '')
+         sentence = sentence.replace('<', '')
+         sentence = sentence.strip()
+         return sentence
+
+     def init_tokenizer(self, sentences):
+         for i in range(len(sentences)):
+             if pd.isna(sentences[i]):
+                 sentences[i] = ""
+             sentences[i] = self.clean_sentence(sentences[i])
+         self.tokenizer = Tokenizer(oov_token='UNK', num_words=self.tokenizer_num_words)
+         self.tokenizer.fit_on_texts(sentences)  # give each word a unique id
+
+     def get_tokenizer_num_words(self):
+         return self.tokenizer_num_words
+
+     def get_token_of_word(self, word):
+         return self.tokenizer.word_index[word]
+
+     def get_word_from_token(self, token):
+         try:
+             return self.tokenizer.index_word[token]
+         except KeyError:
+             return ""
+
+     def get_sentence_from_tokens(self, tokens):
+         sentence = []
+         for token in tokens[0]:
+             word = self.get_word_from_token(token)
+             if word == 'endseq':
+                 return sentence
+             if word != 'startseq':
+                 sentence.append(word)
+         return sentence
+
+     def get_string_from_word_list(self, word_list):
+         return " ".join(word_list)
+
+     def get_word_tokens_list(self):
+         return self.tokenizer.word_index
+
+     def tokenize_sentences(self, sentences):
+         index = 0
+         tokenized_sentences = np.zeros((sentences.shape[0], self.max_caption_length), dtype=int)
+         for caption in sentences:
+             tokenized_caption = self.tokenizer.texts_to_sequences([self.clean_sentence(caption[0])])
+             tokenized_sentences[index] = pad_sequences(tokenized_caption, maxlen=self.max_caption_length,
+                                                        padding='post')  # padded to max length
+             index = index + 1
+         return tokenized_sentences
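
A minimal usage sketch for TokenizerWrapper follows; the class_name and max_caption_length values are illustrative assumptions, not taken from this commit:

    from tokenizer_wrapper import TokenizerWrapper

    wrapper = TokenizerWrapper(class_name="caption", max_caption_length=100)  # hypothetical config
    ids = wrapper.GPT2_encode("heart size is normal endseq")  # single string -> id list padded to max_caption_length
    text = wrapper.GPT2_decode(ids)                           # decode, skipping special tokens
    print(wrapper.filter_special_words(text))                 # strip any leftover sequence markers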
utility.py ADDED
@@ -0,0 +1,64 @@
+ import numpy as np
+ import os
+ import pandas as pd
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import DataLoader
+ from generator import AugmentedImageSequence
+ import torch.optim as optim
+ from tensorflow.keras.models import model_from_json  # type: ignore
+
+ def get_dataloader(csv, batch_size, FLAGS, tokenizer_wrapper, augmenter=None):
+     """
+     Replaces the TensorFlow enqueuer with a PyTorch DataLoader.
+     """
+     data_generator = AugmentedImageSequence(
+         dataset_csv_file=csv,
+         class_names=FLAGS.csv_label_columns,
+         tokenizer_wrapper=tokenizer_wrapper,
+         source_image_dir=FLAGS.image_directory,
+         batch_size=batch_size,
+         target_size=FLAGS.image_target_size,
+         augmenter=augmenter,
+         shuffle_on_epoch_end=True,
+     )
+     dataloader = DataLoader(data_generator, shuffle=True, num_workers=0)
+     return dataloader, data_generator.steps
+
+
+ def get_layers(layer_sizes, activation='relu'):
+     """
+     Builds a stack of PyTorch layers from the given sizes: values < 1 become
+     Dropout layers with that probability, values >= 1 become Linear layers
+     followed by the activation.
+     """
+     layers = []
+     activation_fn = getattr(nn, activation.capitalize(), nn.ReLU)  # fall back to nn.ReLU if the name is unknown
+     for layer_size in layer_sizes:
+         if layer_size < 1:
+             layers.append(nn.Dropout(layer_size))
+         else:
+             layers.append(nn.Linear(in_features=layer_size, out_features=layer_size))
+             layers.append(activation_fn())
+     return nn.Sequential(*layers)  # return as a sequential module for easy stacking
+
+
+ def get_optimizer(optimizer_type, learning_rate, lr_decay=0):
+     optimizer_class = getattr(optim, optimizer_type)
+     # created with a placeholder parameter; the real parameters are registered later
+     dummy_param = torch.nn.Parameter(torch.empty(0))
+     optimizer = optimizer_class(params=[dummy_param], lr=learning_rate, weight_decay=lr_decay)
+     return optimizer
+
+
+ def load_model(load_path, model_name):
+     path = os.path.join(load_path, model_name)
+
+     # load the architecture from JSON and rebuild the model
+     with open('{}.json'.format(path), 'r') as json_file:
+         loaded_model_json = json_file.read()
+     loaded_model = model_from_json(loaded_model_json)
+     # load the weights into the rebuilt model
+     loaded_model.load_weights("{}.h5".format(path))
+     print("Loaded model from disk")
+     return loaded_model
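
A brief sketch of how get_layers and get_optimizer compose; the layer sizes, optimizer name, and learning rate are illustrative assumptions:

    from utility import get_layers, get_optimizer

    # hypothetical head: Linear(256,256)+ReLU, Dropout(0.2), Linear(256,256)+ReLU
    head = get_layers([256, 0.2, 256], activation='relu')
    optimizer = get_optimizer('Adam', learning_rate=1e-4)
    optimizer.add_param_group({'params': list(head.parameters())})  # register the real parameters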
utils.py ADDED
@@ -0,0 +1,39 @@
+ from PIL import Image
+ import io
+ import numpy as np
+ import pydicom
+ import torch
+ from fastapi import HTTPException, UploadFile
+ from skimage.transform import resize
+
+
+ def load_image(image):
+     image = image.convert("RGB")
+     image_array = np.asarray(image) / 255.0  # normalize to [0, 1]
+     image_array = resize(image_array, (224, 224))
+     image_tensor = torch.tensor(image_array, dtype=torch.float32).permute(2, 0, 1)  # HxWxC -> CxHxW
+
+     # standard ImageNet normalization statistics
+     mean = torch.tensor([0.485, 0.456, 0.406])
+     std = torch.tensor([0.229, 0.224, 0.225])
+     image_tensor = (image_tensor - mean[:, None, None]) / std[:, None, None]
+
+     return image_tensor.unsqueeze(0)  # add batch dimension
+
+
+ async def convert_to_png(file: UploadFile) -> Image.Image:
+     """Converts a JPG, PNG, or DICOM upload to a PIL image."""
+     image_data = await file.read()
+     if file.content_type in ["image/jpeg", "image/png", "image/jpg"]:
+         image = Image.open(io.BytesIO(image_data))
+         return image
+
+     if file.content_type == "application/dicom" or file.filename.endswith((".dcm", ".dicom")):
+         dicom_data = pydicom.dcmread(io.BytesIO(image_data))
+         pixel_array = dicom_data.pixel_array
+
+         # rescale to 8-bit before converting to a PIL image
+         if pixel_array.dtype != np.uint8:
+             pixel_array = (pixel_array / pixel_array.max() * 255).astype(np.uint8)
+
+         image = Image.fromarray(pixel_array).convert("RGB")
+         return image
+
+     raise HTTPException(status_code=400, detail="Unsupported media type")
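
For context, a sketch of how these helpers might be wired into a FastAPI endpoint; the app, route, and commented-out model call are hypothetical and not part of this commit:

    from fastapi import FastAPI, UploadFile
    from utils import convert_to_png, load_image

    app = FastAPI()

    @app.post("/predict")
    async def predict(file: UploadFile):
        image = await convert_to_png(file)  # PIL image from a JPG/PNG/DICOM upload
        tensor = load_image(image)          # normalized 1x3x224x224 float tensor
        # outputs = model(tensor)           # hypothetical model call
        return {"input_shape": list(tensor.shape)}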