IlPakoZ committed on
Commit
79111ac
·
verified ·
1 Parent(s): 7d1331e

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +103 -60
  2. readme.md +191 -0
  3. readme_spaces.md +169 -0
app.py CHANGED
@@ -18,7 +18,7 @@ from PIL import Image, ImageDraw, ImageFont
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
- def create_placeholder_image(width=400, height=300, text="No visualization available", bg_color=(0, 0, 0, 0)):
22
  """
23
  Create a transparent placeholder image with text
24
 
@@ -231,7 +231,6 @@ class DrugTargetInteractionApp:
231
  logger.info(f"Target attention mask shape: {target_inputs['attention_mask'].shape}")
232
  logger.info(f"Drug attention mask shape: {drug_inputs['attention_mask'].shape}")
233
 
234
-
235
  cross_attention_img = plot_crossattention_weights(
236
  target_inputs["attention_mask"][0],
237
  drug_inputs["attention_mask"][0],
@@ -389,10 +388,7 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
389
  lines=2
390
  )
391
 
392
- # Buttons side by side
393
- with gr.Row():
394
- predict_btn = gr.Button("🚀 Predict Interaction", variant="primary", size="lg")
394
- visualize_btn = gr.Button("📊 Visualize Interaction", variant="secondary", size="lg")
396
 
397
  with gr.Column(scale=1):
398
  prediction_output = gr.Textbox(
@@ -401,46 +397,6 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
401
  lines=3
402
  )
403
 
404
- # Visualization outputs section
405
- gr.HTML("<h3 style='margin-top: 30px; color: #2E86AB;'>📈 Interaction Visualizations</h3>")
406
-
407
- with gr.Row():
408
- with gr.Column():
409
- viz_image1 = gr.Image(
410
- label="Cross-Attention Heatmap",
411
- type="pil",
412
- interactive=False,
413
- container=True,
414
- height=300,
415
- value=create_placeholder_image(text="Cross-Attention Heatmap\n(Click Visualize to generate)")
416
- )
417
-
418
- with gr.Column():
419
- viz_image2 = gr.Image(
420
- label="Raw pKd Contribution Visualization",
421
- type="pil",
422
- interactive=False,
423
- container=True,
424
- height=300,
425
- value=create_placeholder_image(text="Raw pKd Contribution\n(Click Visualize to generate)")
426
- )
427
-
428
- with gr.Column():
429
- viz_image3 = gr.Image(
430
- label="Normalized pKd Contribution Visualization",
431
- type="pil",
432
- interactive=False,
433
- container=True,
434
- height=300,
435
- value=create_placeholder_image(text="Normalized pKd Contribution\n(Click Visualize to generate)")
436
- )
437
-
438
- viz_status = gr.Textbox(
439
- label="Visualization Status",
440
- interactive=False,
441
- lines=2
442
- )
443
-
444
  # Example inputs
445
  gr.HTML("<h3 style='margin-top: 20px; color: #2E86AB;'>📚 Example Inputs:</h3>")
446
 
@@ -461,18 +417,100 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
461
  cache_examples=False
462
  )
463
 
464
- # Button click events
465
  predict_btn.click(
466
  fn=predict_wrapper,
467
  inputs=[target_input, drug_input],
468
  outputs=prediction_output
469
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471
  visualize_btn.click(
472
  fn=visualize_wrapper,
473
- inputs=[target_input, drug_input],
474
  outputs=[viz_image1, viz_image2, viz_image3, viz_status]
475
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
 
477
  with gr.Tab("⚙️ Model Settings"):
478
  gr.HTML("<h3 style='color: #2E86AB;'>Model Configuration</h3>")
@@ -483,7 +521,7 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
483
  placeholder="Path to model directory"
484
  )
485
 
486
- load_model_btn = gr.Button("🔥 Load Model", variant="secondary")
487
  model_status = gr.Textbox(
488
  label="Status",
489
  interactive=False,
@@ -502,13 +540,13 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
502
 
503
  This application uses a deep learning model for predicting drug-target interactions. The model architecture includes:
504
 
505
- - **Target Encoder**: Processes RNA sequences
506
- - **Drug Encoder**: Processes molecular SMILES notation
507
  - **Cross-Attention Mechanism**: Captures interactions between drugs and targets
508
- - **Regression Head**: Predicts binding affinity scores
509
 
510
  ### Input Requirements:
511
- - **Target Sequence**: RNA sequence of the target
512
  - **Drug SMILES**: Simplified Molecular Input Line Entry System notation
513
 
514
  ### Model Features:
@@ -519,16 +557,21 @@ with gr.Blocks(title="Drug-Target Interaction Predictor", theme=gr.themes.Soft()
519
 
520
  ### Usage Tips:
521
  1. Load your trained model using the Model Settings tab
522
- 2. Enter a RNA sequence and drug SMILES
523
- 3. Click "Predict Interaction" to get binding affinity prediction
524
- 4. Click "Visualize Interaction" to see detailed interaction analysis
525
 
526
- For best results, ensure your input sequences are properly formatted and within reasonable length limits.
527
 
528
  ### Visualization Features:
529
- - **Cross-Attention Heatmap**: Shows cross-attention between drug and target tokens
530
- - **Raw pKd Contribution**: Shows raw signed contributions (only when pKd > 0)
531
- - **Normalized pKd Contribution**: Shows normalized non-negative contributions
 
 
 
 
 
532
  """)
533
 
534
  # Launch the app
 
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
+ def create_placeholder_image(width=600, height=400, text="No visualization available", bg_color=(0, 0, 0, 0)):
22
  """
23
  Create a transparent placeholder image with text
24
 
 
231
  logger.info(f"Target attention mask shape: {target_inputs['attention_mask'].shape}")
232
  logger.info(f"Drug attention mask shape: {drug_inputs['attention_mask'].shape}")
233
 
 
234
  cross_attention_img = plot_crossattention_weights(
235
  target_inputs["attention_mask"][0],
236
  drug_inputs["attention_mask"][0],
 
388
  lines=2
389
  )
390
 
391
+ predict_btn = gr.Button("🚀 Predict Interaction", variant="primary", size="lg")
 
 
 
392
 
393
  with gr.Column(scale=1):
394
  prediction_output = gr.Textbox(
 
397
  lines=3
398
  )
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  # Example inputs
401
  gr.HTML("<h3 style='margin-top: 20px; color: #2E86AB;'>📚 Example Inputs:</h3>")
402
 
 
417
  cache_examples=False
418
  )
419
 
420
+ # Button click event
421
  predict_btn.click(
422
  fn=predict_wrapper,
423
  inputs=[target_input, drug_input],
424
  outputs=prediction_output
425
  )
426
+
427
+ with gr.Tab("📊 Visualizations"):
428
+ gr.HTML("""
429
+ <div style="text-align: center; margin-bottom: 20px;">
430
+ <h2 style="color: #2E86AB;">🔬 Interaction Analysis & Visualizations</h2>
431
+ <p style="font-size: 1.1em; color: #666;">
432
+ Generate detailed visualizations to understand drug-target interactions
433
+ </p>
434
+ </div>
435
+ """)
436
+
437
+ with gr.Row():
438
+ with gr.Column(scale=1):
439
+ viz_target_input = gr.Textbox(
440
+ label="Target RNA Sequence",
441
+ placeholder="Enter RNA sequence (e.g., AUGCUAGCUAGUACGUA...)",
442
+ lines=4,
443
+ max_lines=6
444
+ )
445
+
446
+ viz_drug_input = gr.Textbox(
447
+ label="Drug SMILES",
448
+ placeholder="Enter SMILES notation (e.g., CC(C)CC1=CC=C(C=C1)C(C)C(=O)O)",
449
+ lines=2
450
+ )
451
+
452
+ visualize_btn = gr.Button("📊 Generate Visualizations", variant="primary", size="lg")
453
+
454
+ viz_status = gr.Textbox(
455
+ label="Visualization Status",
456
+ interactive=False,
457
+ lines=3
458
+ )
459
+
460
+ # Visualization outputs - Large and vertically aligned
461
+ gr.HTML("<div style='margin-top: 30px;'></div>")
462
+
463
+ viz_image1 = gr.Image(
464
+ label="Cross-Attention Heatmap",
465
+ type="pil",
466
+ interactive=False,
467
+ container=True,
468
+ height=500,
469
+ value=create_placeholder_image(text="Cross-Attention Heatmap\n(Click Generate Visualizations to create)")
470
+ )
471
 
472
+ viz_image2 = gr.Image(
473
+ label="Raw pKd Contribution Visualization",
474
+ type="pil",
475
+ interactive=False,
476
+ container=True,
477
+ height=500,
478
+ value=create_placeholder_image(text="Raw pKd Contribution\n(Click Generate Visualizations to create)")
479
+ )
480
+
481
+ viz_image3 = gr.Image(
482
+ label="Normalized pKd Contribution Visualization",
483
+ type="pil",
484
+ interactive=False,
485
+ container=True,
486
+ height=500,
487
+ value=create_placeholder_image(text="Normalized pKd Contribution\n(Click Generate Visualizations to create)")
488
+ )
489
+
490
+ # Button click event for visualizations
491
  visualize_btn.click(
492
  fn=visualize_wrapper,
493
+ inputs=[viz_target_input, viz_drug_input],
494
  outputs=[viz_image1, viz_image2, viz_image3, viz_status]
495
  )
496
+
497
+ # Example inputs for visualization tab
498
+ gr.HTML("<h3 style='margin-top: 20px; color: #2E86AB;'>📚 Example Inputs:</h3>")
499
+
500
+ viz_examples = gr.Examples(
501
+ examples=[
502
+ [
503
+ "AUGCUAGCUAGUACGUAUAUCUGCACUGC",
504
+ "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
505
+ ],
506
+ [
507
+ "AUGCGAUCGACGUACGUUAGCCGUAGCGUAGCUAGUGUAGCUAGUAGCU",
508
+ "C1=CC=C(C=C1)NC(=O)C2=CC=CC=N2"
509
+ ]
510
+ ],
511
+ inputs=[viz_target_input, viz_drug_input],
512
+ cache_examples=False
513
+ )
514
 
515
  with gr.Tab("⚙️ Model Settings"):
516
  gr.HTML("<h3 style='color: #2E86AB;'>Model Configuration</h3>")
 
521
  placeholder="Path to model directory"
522
  )
523
 
524
+ load_model_btn = gr.Button("📥 Load Model", variant="secondary")
525
  model_status = gr.Textbox(
526
  label="Status",
527
  interactive=False,
 
540
 
541
  This application uses a deep learning model for predicting drug-target interactions. The model architecture includes:
542
 
543
+ - **Target Encoder**: Processes RNA sequences using RNA-BERTa
544
+ - **Drug Encoder**: Processes molecular SMILES notation using ChemBERTa
545
  - **Cross-Attention Mechanism**: Captures interactions between drugs and targets
546
+ - **Regression Head**: Predicts binding affinity scores (pKd values)
547
 
548
  ### Input Requirements:
549
+ - **Target Sequence**: RNA sequence of the target (nucleotide sequences: A, U, G, C)
550
  - **Drug SMILES**: Simplified Molecular Input Line Entry System notation
551
 
552
  ### Model Features:
 
557
 
558
  ### Usage Tips:
559
  1. Load your trained model using the Model Settings tab
560
+ 2. Enter an RNA sequence and drug SMILES in either the Prediction or Visualizations tab
561
+ 3. Click "Predict Interaction" for binding affinity prediction only
562
+ 4. Click "Generate Visualizations" for detailed interaction analysis with visual interpretations
563
 
564
+ For best results, ensure your input sequences are properly formatted and within reasonable length limits (max 512 tokens).
565
 
566
  ### Visualization Features:
567
+ - **Cross-Attention Heatmap**: Shows cross-attention weights between drug and target tokens
568
+ - **Raw pKd Contribution**: Shows raw signed contributions from each target token (only when pKd > 0)
569
+ - **Normalized pKd Contribution**: Shows normalized non-negative contributions from each target token
570
+
571
+ ### Performance Metrics:
572
+ - Trained on diverse drug-target interaction datasets
573
+ - Evaluated using RMSE, Pearson correlation, and Concordance Index
574
+ - Optimized for both predictive accuracy and interpretability
575
  """)
576
 
577
  # Launch the app
readme.md ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Drug-Target Interaction Prediction Model
2
+
3
+ ## Model Description
4
+
5
+ This model predicts drug-target interactions using a novel cross-attention architecture that combines RNA sequence understanding with molecular representation learning. The model processes RNA target sequences and drug SMILES representations to predict binding affinity scores (pKd values).
6
+
7
+ ## Architecture
8
+
9
+ The model consists of several key components:
10
+
11
+ 1. **Target Encoder**: RNA-BERTa model that processes RNA sequences (nucleotides A, U, G, C)
12
+ 2. **Drug Encoder**: ChemBERTa-77M-MTR model [1] that processes molecular SMILES representations
13
+ 3. **Cross-Attention Layer**: Single-head attention mechanism (1 head) that models interactions between drug and target representations
14
+ 4. **Regression Head**: Predicts binding affinity scores with learnable scaling and bias parameters
15
+
16
+ ### Technical Specifications
17
+
18
+ - **Model Size**: Combines RNA-BERTa (target encoder) + ChemBERTa-77M-MTR (drug encoder)
19
+ - **Cross-Attention**: Single-head attention with 384-dimensional embeddings
20
+ - **Maximum Sequence Length**: 512 tokens for both target and drug inputs
21
+ - **Output**: Continuous binding affinity prediction (pKd values)
22
+ - **Dropout**: Configurable attention dropout and hidden dropout for regularization
23
+ - **Layer Normalization**: Applied for training stability
24
+
25
+ ## Performance Metrics
26
+
27
+ Evaluated on external ROBIN test datasets [2] across different RNA classes:
28
+
29
+ | Dataset | Precision | Specificity | Recall | AUROC | F1 Score |
30
+ |---------|-----------|-------------|---------|-------|----------|
31
+ | Aptamers | 0.648 | 0.002 | 1.000 | 0.571 | 0.787 |
32
+ | Riboswitch | 0.519 | 0.035 | 0.972 | 0.577 | 0.677 |
33
+ | Viral RNA | 0.562 | 0.095 | 0.943 | 0.579 | 0.704 |
34
+ | miRNA | 0.373 | 0.028 | 0.991 | 0.596 | 0.542 |
35
+
36
+ ## Usage
37
+
38
+ ### Using the Gradio Interface
39
+
40
+ ```python
41
+ import gradio as gr
42
+ from updated_app import demo
43
+
44
+ # Launch the interactive interface
45
+ demo.launch()
46
+ ```
47
+
48
+ ### Programmatic Usage
49
+
50
+ ```python
51
+ from modeling_dlmberta import InteractionModelATTNForRegression, StdScaler
52
+ from configuration_dlmberta import InteractionModelATTNConfig
53
+ from transformers import AutoModel, RobertaModel, AutoConfig
54
+ from chemberta import ChembertaTokenizer
55
+
56
+ # Load model components
57
+ config = InteractionModelATTNConfig.from_pretrained("path/to/model")
58
+
59
+ # Load encoders
60
+ target_encoder = AutoModel.from_pretrained("IlPakoZ/RNA-BERTa9700")
61
+ drug_encoder_config = AutoConfig.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
62
+ drug_encoder_config.pooler = None
63
+ drug_encoder = RobertaModel(config=drug_encoder_config, add_pooling_layer=False)
64
+
65
+ # Load scaler (if available)
66
+ scaler = StdScaler()
67
+ scaler.load("path/to/model")
68
+
69
+ # Initialize model
70
+ model = InteractionModelATTNForRegression.from_pretrained(
71
+ "path/to/model",
72
+ config=config,
73
+ target_encoder=target_encoder,
74
+ drug_encoder=drug_encoder,
75
+ scaler=scaler
76
+ )
77
+
78
+ # Make predictions
79
+ target_sequence = "AUGCGAUCGACGUACGUUAGCCGUAGCGUAGCUAGUGUAGCUAGUAGCU"
80
+ drug_smiles = "C1=CC=C(C=C1)NC(=O)C2=CC=CC=N2"
81
+
82
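+ # Tokenize inputs (target_tokenizer and drug_tokenizer are not defined in this snippet; load the RNA and SMILES tokenizers beforehand)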
83
+ target_inputs = target_tokenizer(target_sequence, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
84
+ drug_inputs = drug_tokenizer(drug_smiles, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
85
+
86
+ # Predict (requires `import torch`, which this snippet does not import)
87
+ with torch.no_grad():
88
+ prediction = model(target_inputs, drug_inputs)
89
+ if model.scaler:
90
+ prediction = model.unscale(prediction)
91
+ ```
92
+
93
+ ## Model Inputs
94
+
95
+ - **Target Sequence**: RNA sequence using nucleotides A, U, G, C (string)
96
+ - **Drug SMILES**: Simplified Molecular Input Line Entry System notation (string)
97
+
98
+ ## Model Outputs
99
+
100
+ - **Binding Affinity**: Predicted pKd binding affinity score (float)
101
+ - **Attention Weights**: Cross-attention weights for interpretability analysis (when enabled)
102
+
103
+ ## Interpretability Features
104
+
105
+ The model includes advanced interpretability capabilities:
106
+
107
+ - **Cross-Attention Visualization**: Heatmaps showing interaction patterns between drug and target tokens
108
+ - **Token-Level Contributions**: Visualization of individual token contributions to the final prediction
109
+ - **Raw vs. Normalized Contributions**: Both scaled and unscaled contribution analysis
110
+ - **Interpretation Mode**: Special mode for extracting attention weights and intermediate values
111
+
112
+ ### Enabling Interpretation Mode
113
+
114
+ ```python
115
+ # Enable interpretation mode (evaluation only)
116
+ model.INTERPR_ENABLE_MODE()
117
+
118
+ # Make prediction with interpretation data
119
+ prediction = model(target_inputs, drug_inputs)
120
+
121
+ # Access interpretation data
122
+ cross_attention_weights = model.model.crossattention_weights
123
+ presum_contributions = model.model.presum_layer
124
+ attention_scores = model.model.scores
125
+
126
+ # Disable interpretation mode
127
+ model.INTERPR_DISABLE_MODE()
128
+ ```
129
+
130
+ ## Training Details
131
+
132
+ ### Data Processing
133
+ - **Scaling**: Uses StdScaler for target value normalization
134
+ - **Tokenization**: Separate tokenizers for RNA sequences and SMILES strings
135
+ - **Padding**: Max length padding to 512 tokens
136
+ - **Masking**: Attention masks to handle variable-length sequences
137
+
138
+ ### Architecture Details
139
+ - **Embedding Dimension**: 384 for cross-attention layer
140
+ - **Target Encoder Output**: 512 dimensions, mapped to 384
141
+ - **Drug Encoder Output**: 384 dimensions (direct use)
142
+ - **Attention Mechanism**: Single-head cross-attention with scaled dot-product
143
+ - **Learnable Parameters**: Weighted sum with learnable scaling vector and bias
144
+ - **Padding Handling**: Learnable padding value for masked positions
145
+
146
+ ## Limitations
147
+
148
+ - Performance varies significantly across RNA classes (miRNA shows lower precision)
149
+ - May not generalize well to RNA sequences or chemical scaffolds not represented in training data
150
+ - Computational requirements scale with sequence length (max 512 tokens)
151
+ - Single attention head may limit capacity to capture diverse interaction patterns
152
+ - SMILES representation may not capture all relevant molecular properties
153
+
154
+ ## Files in this Repository
155
+
156
+ - `modeling_dlmberta.py`: Main model implementation with cross-attention architecture
157
+ - `configuration_dlmberta.py`: Model configuration class
158
+ - `chemberta.py`: Custom tokenizer for chemical SMILES processing
159
+ - `updated_app.py`: Gradio application interface with visualization capabilities
160
+ - `analysis.py`: Visualization functions for interpretability
161
+ - `requirements.txt`: Python dependencies
162
+ - `config.json`: Model configuration file
163
+
164
+ ## License
165
+
166
+ This model is released under the MIT License.
167
+
168
+ ### Citations
169
+ [1]
170
+ ```bibtex
171
+ @article{ahmad2022chemberta,
172
+ title={Chemberta-2: Towards chemical foundation models},
173
+ author={Ahmad, Walid and Simon, Elana and Chithrananda, Seyone and Grand, Gabriel and Ramsundar, Bharath},
174
+ journal={arXiv preprint arXiv:2209.01712},
175
+ year={2022}
176
+ }
177
+ ```
178
+
179
+ [2]
180
+ ```bibtex
181
+ @article{krishnan2024reliable,
182
+ title={Reliable method for predicting the binding affinity of RNA-small molecule interactions using machine learning},
183
+ author={Krishnan, Sowmya R and Roy, Arijit and Gromiha, M Michael},
184
+ journal={Briefings in Bioinformatics},
185
+ volume={25},
186
+ number={2},
187
+ pages={bbae002},
188
+ year={2024},
189
+ publisher={Oxford University Press}
190
+ }
191
+ ```
readme_spaces.md ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Drug-Target Interaction Predictor
3
+ emoji: 🧬
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: updated_app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # Drug-Target Interaction Predictor
14
+
15
+ An interactive deep learning application for predicting drug-target interactions using a novel cross-attention architecture. This model processes RNA sequences and drug SMILES representations to predict binding affinity scores (pKd values) with interpretability features.
16
+
17
+ ## Features
18
+
19
+ - 🔮 **Binding Affinity Prediction**: Input RNA sequences and drug SMILES to get quantitative binding affinity predictions
20
+ - 📊 **Interactive Visualizations**: Generate cross-attention heatmaps and contribution analysis plots
21
+ - 🧬 **RNA-Drug Interaction Analysis**: Understand how different tokens contribute to binding predictions
22
+ - ⚙️ **Model Management**: Load and configure different model checkpoints
23
+ - 🎯 **Interpretability Tools**: Visualize attention weights and token-level contributions
24
+ - 📈 **Performance Metrics**: Evaluated on multiple RNA classes (Aptamers, Riboswitches, Viral RNA, miRNA)
25
+
26
+ ## How to Use
27
+
28
+ ### 1. Prediction Tab
29
+ - **Load Model**: The model loads automatically on startup (if available in the current directory)
30
+ - **Enter Inputs**:
31
+ - Target RNA sequence (nucleotides: A, U, G, C)
32
+ - Drug SMILES string (molecular representation)
33
+ - **Get Results**: Click "Predict Interaction" to receive binding affinity prediction (pKd value)
34
+
35
+ ### 2. Visualizations Tab
36
+ - **Generate Analysis**: Use the same inputs to create detailed visualizations
37
+ - **Cross-Attention Heatmap**: Shows interaction patterns between drug and target tokens
38
+ - **Raw pKd Contribution**: Displays signed contributions from each target token (only when pKd > 0)
39
+ - **Normalized pKd Contribution**: Shows normalized contributions for all predictions
40
+
41
+ ### 3. Model Settings Tab
42
+ - **Custom Models**: Load your own trained models by specifying the model directory path
43
+ - **Status Monitoring**: Check model loading status and configuration
44
+
45
+ ## Model Architecture
46
+
47
+ The model combines state-of-the-art language models with cross-attention mechanisms:
48
+
49
+ - **Target Encoder**: RNA-BERTa model for processing RNA sequences
50
+ - **Drug Encoder**: ChemBERTa-77M-MTR model [1] for molecular SMILES processing
51
+ - **Cross-Attention**: Single-head attention mechanism (384-dimensional embeddings)
52
+ - **Regression Head**: Learnable weighted sum with scaling and bias parameters
53
+ - **Interpretability**: Built-in interpretation mode for attention analysis
54
+
55
+ ## Performance on ROBIN Test Datasets
56
+
57
+ Evaluated on external ROBIN test datasets [2] across different RNA classes:
58
+
59
+ | RNA Class | Precision | Specificity | Recall | AUROC | F1 Score |
60
+ |-----------|-----------|-------------|---------|-------|----------|
61
+ | Aptamers | 0.648 | 0.002 | 1.000 | 0.571 | 0.787 |
62
+ | Riboswitch | 0.519 | 0.035 | 0.972 | 0.577 | 0.677 |
63
+ | Viral RNA | 0.562 | 0.095 | 0.943 | 0.579 | 0.704 |
64
+ | miRNA | 0.373 | 0.028 | 0.991 | 0.596 | 0.542 |
65
+
66
+ ## Example Usage
67
+
68
+ Try these example inputs to see the model in action:
69
+
70
+ **Example 1:**
71
+ - **Target**: `AUGCUAGCUAGUACGUAUAUCUGCACUGC`
72
+ - **Drug**: `CC(C)CC1=CC=C(C=C1)C(C)C(=O)O`
73
+
74
+ **Example 2:**
75
+ - **Target**: `AUGCGAUCGACGUACGUUAGCCGUAGCGUAGCUAGUGUAGCUAGUAGCU`
76
+ - **Drug**: `C1=CC=C(C=C1)NC(=O)C2=CC=CC=N2`
77
+
78
+ ## Input Format Requirements
79
+
80
+ - **Target Sequence**:
81
+ - RNA sequences using nucleotides A, U, G, C
82
+ - Maximum length: 512 tokens
83
+ - Automatically truncated/padded as needed
84
+
85
+ - **Drug SMILES**:
86
+ - Standard SMILES notation for molecular structures
87
+ - Maximum length: 512 tokens
88
+ - Example: `CC(C)CC1=CC=C(C=C1)C(C)C(=O)O` (Ibuprofen)
89
+
90
+ ## Technical Specifications
91
+
92
+ - **Model Size**: RNA-BERTa + ChemBERTa-77M-MTR backbone
93
+ - **Attention Heads**: 1 (single-head cross-attention)
94
+ - **Embedding Dimension**: 384 for cross-attention layer
95
+ - **Maximum Sequence Length**: 512 tokens for both inputs
96
+ - **Output Range**: Continuous pKd values (can be negative)
97
+ - **Scaling**: Built-in StdScaler for target value normalization
98
+
99
+ ## Visualization Features
100
+
101
+ ### Cross-Attention Heatmap
102
+ - Displays attention weights between drug and target tokens
103
+ - Helps identify which molecular features interact with specific RNA regions
104
+ - Color intensity represents attention strength
105
+
106
+ ### Contribution Analysis
107
+ - **Raw Contributions**: Signed values showing positive/negative token impacts (only for pKd > 0)
108
+ - **Normalized Contributions**: Non-negative values showing relative token importance
109
+ - Token-level breakdown of final prediction components
110
+
111
+ ## Limitations & Considerations
112
+
113
+ - **RNA Class Variation**: Performance differs across RNA types (miRNA shows lower precision)
114
+ - **Novel Sequences**: May not generalize well to completely unseen RNA families or chemical scaffolds
115
+ - **Sequence Length**: Limited to 512 tokens (longer sequences are truncated)
116
+ - **SMILES Limitations**: May not capture all 3D molecular properties
117
+ - **Single Attention Head**: May limit capacity for complex interaction patterns
118
+
119
+ ## Scientific Applications
120
+
121
+ This tool can be used for:
122
+ - Drug discovery and design
123
+ - RNA-targeted therapeutics research
124
+ - Molecular interaction analysis
125
+ - Binding affinity prediction
126
+ - Structure-activity relationship studies
127
+ - Lead compound optimization
128
+
129
+ ## Technical Support
130
+
131
+ For technical issues or questions:
132
+ - Check model loading status in the Model Settings tab
133
+ - Ensure input sequences are properly formatted
134
+ - Verify SMILES notation validity
135
+ - Review example inputs for correct format
136
+
137
+ ## Data Sources
138
+
139
+ The model leverages:
140
+ - **RNA-BERTa**: Pre-trained on diverse RNA sequences
141
+ - **ChemBERTa-77M-MTR**: Trained on molecular property prediction tasks [1]
142
+ - **ROBIN Datasets**: External validation across multiple RNA classes [2]
143
+
144
+ For more detailed technical documentation, model architecture details, and programmatic usage, visit the [model repository](https://huggingface.co/IlPakoZ/DLRNA-BERTa9700).
145
+
146
+ ### Citations
147
+ [1]
148
+ ```bibtex
149
+ @article{ahmad2022chemberta,
150
+ title={Chemberta-2: Towards chemical foundation models},
151
+ author={Ahmad, Walid and Simon, Elana and Chithrananda, Seyone and Grand, Gabriel and Ramsundar, Bharath},
152
+ journal={arXiv preprint arXiv:2209.01712},
153
+ year={2022}
154
+ }
155
+ ```
156
+
157
+ [2]
158
+ ```bibtex
159
+ @article{krishnan2024reliable,
160
+ title={Reliable method for predicting the binding affinity of RNA-small molecule interactions using machine learning},
161
+ author={Krishnan, Sowmya R and Roy, Arijit and Gromiha, M Michael},
162
+ journal={Briefings in Bioinformatics},
163
+ volume={25},
164
+ number={2},
165
+ pages={bbae002},
166
+ year={2024},
167
+ publisher={Oxford University Press}
168
+ }
169
+ ```