Upload folder using huggingface_hub

- .gitattributes +1 -35
- .gitignore +7 -0
- LICENSE +201 -0
- README.md +147 -3
- config/model_config.json +34 -0
- eval/README.md +123 -0
- eval/eval_compare_matrix.py +406 -0
- eval/install_requirements.sh +1 -0
- inference.py +85 -0
- install_requirements.sh +3 -0
- model/autoencoders.py +374 -0
- model/ear_vae.py +112 -0
- model/transformer.py +846 -0
- pretrained_weight/ear_vae_44k.pyt +3 -0
.gitattributes
CHANGED
@@ -1,35 +1 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pyt filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,7 @@
# pretrained_weight/*
results
data/*
__pycache__
*.pyc
docs
images
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
CHANGED
@@ -1,3 +1,147 @@
# εar-VAE: High Fidelity Music Reconstruction Model

This repository contains the official inference code for εar-VAE, a 44.1 kHz music signal reconstruction model that rethinks and optimizes VAE training for audio. It targets two common weaknesses in existing open-source VAEs—phase accuracy and stereophonic spatial representation—by aligning objectives with auditory perception and introducing phase-aware training. Experiments show substantial improvements across diverse metrics, with particular strength in high-frequency harmonics and spatial characteristics.

<p align="center">
<img src="./images/all_compares.jpg" width=90%>
<img src="./images/table.png" width=90%>
</p>

<p align="center">
<em>Upper: Ablation study across our training components.</em> <em>Lower: Cross-model metric comparison on the evaluation dataset.</em>
</p>

Why εar-VAE:
- 🎧 Perceptual alignment: A K-weighting perceptual filter is applied before loss computation to better match human hearing.
- 🔁 Phase-aware objectives: Two novel phase losses:
  - Stereo Correlation Loss for robust inter-channel coherence.
  - Phase-Derivative Loss using Instantaneous Frequency and Group Delay for phase precision.
- 🌈 Spectral supervision paradigm: Magnitude is supervised across MSLR (Mid/Side/Left/Right) components, while phase is supervised only on LR (Left/Right), improving stability and fidelity (illustrated in the sketch after this list).
- 📈 44.1 kHz performance: Outperforms leading open-source models across diverse metrics.

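The Mid/Side components referenced in the spectral-supervision bullet are a simple linear transform of the stereo channels. The snippet below is an illustrative sketch only (the training losses themselves are not part of this inference release); it assumes a `[batch, 2, time]` stereo tensor and shows the M/S/L/R components on which the magnitude and phase objectives described above would operate.

```python
import torch

def to_mslr(stereo: torch.Tensor):
    """Split a [batch, 2, time] stereo waveform into Mid, Side, Left, Right.

    Sketch: magnitude losses can be computed on all four components,
    while phase-aware losses use only Left/Right, as described above.
    """
    left, right = stereo[:, 0], stereo[:, 1]
    mid = 0.5 * (left + right)   # in-phase (centre) content
    side = 0.5 * (left - right)  # out-of-phase (stereo) content
    return mid, side, left, right
```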
## 1. Installation

Follow these steps to set up the environment and install the necessary dependencies.

### Installation Steps

1. **Clone the repository:**
   ```bash
   git clone <your-repo-url>
   cd ear_vae
   ```

2. **Create and activate a conda environment:**
   ```bash
   conda create -n ear_vae python=3.8
   conda activate ear_vae
   ```

3. **Run the installation script:**

   This script installs the remaining dependencies.
   ```bash
   bash install_requirements.sh
   ```
   This will install:
   - `descript-audio-codec`
   - `alias-free-torch`
   - `ffmpeg < 7` (via conda)

4. **Download the model weights:**

   You can download the model checkpoint from **[Hugging Face](https://huggingface.co/earlab/EAR_VAE)**.
## 2. Usage

The `inference.py` script processes audio files from an input directory and saves the reconstructed audio to an output directory.

### Running Inference

You can run inference with the following command:

```bash
python inference.py --indir <input_directory> --outdir <output_directory> --model_path <path_to_model> --device <device>
```

### Command-Line Arguments

- `--indir`: (Optional) Path to the input directory containing audio files. Default: `./data`.
- `--outdir`: (Optional) Path to the output directory where reconstructed audio will be saved. Default: `./results`.
- `--model_path`: (Optional) Path to the pretrained model weights (`.pyt` file). Default: `./pretrained_weight/ear_vae_44k.pyt`.
- `--device`: (Optional) The device to run the model on (e.g., `cuda:0` or `cpu`). Defaults to `cuda:0` if available, otherwise `cpu`.

### Example

1. Place your input audio files (e.g., `.wav`, `.mp3`) into the `data/` directory.
2. Run the inference script:

```bash
python inference.py
```
This uses the default paths; the reconstructed audio files will be saved in the `results/` directory.

## 3. Project Structure

```
.
├── README.md                  # This file
├── config/                    # Model configurations
│   └── model_config.json
├── data/                      # Default directory for input audio files
├── eval/                      # Scripts for model evaluation
│   ├── eval_compare_matrix.py
│   ├── install_requirements.sh
│   └── README.md
├── inference.py               # Main script for running audio reconstruction
├── install_requirements.sh    # Installation script for dependencies
├── model/                     # Model architecture code
│   ├── autoencoders.py
│   ├── ear_vae.py
│   └── transformer.py
├── pretrained_weight/         # Directory for pretrained model weights
│   └── your_weight_here
```

## 4. Model Details

The model is a Variational Autoencoder with a Generative Adversarial Network (VAE-GAN) training structure.
- **Encoder**: An Oobleck-style encoder that downsamples the input audio into a latent representation.
- **Bottleneck**: A VAE bottleneck that introduces a probabilistic latent space, sampling from a learned mean and variance.
- **Decoder**: An Oobleck-style decoder that upsamples the latent representation back into an audio waveform.
- **Transformer**: A continuous transformer can optionally be placed in the bottleneck to further process the latent sequence.

This architecture allows for efficient, high-quality audio reconstruction. A minimal Python loading sketch is shown below.

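For readers who prefer the Python API over the CLI, the following sketch mirrors what `inference.py` (added in this commit) does: load `config/model_config.json`, build `EAR_VAE`, load the `.pyt` weights, and call `model.inference` on a `[1, 2, T]` waveform. The input path is a placeholder; a stereo 44.1 kHz file is assumed.

```python
import json
import torch
import torchaudio
from model.ear_vae import EAR_VAE

with open("./config/model_config.json") as f:
    config = json.load(f)

model = EAR_VAE(model_config=config).eval()
model.load_state_dict(torch.load("./pretrained_weight/ear_vae_44k.pyt", map_location="cpu"))

wav, sr = torchaudio.load("data/example.wav")  # placeholder path; stereo, 44.1 kHz assumed
with torch.no_grad():
    recon = model.inference(wav.unsqueeze(0))  # [1, 2, T] in -> [1, 2, T] out
torchaudio.save("results/example_rec.wav", recon.squeeze(0), 44100)
```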
## 5. Evaluation

The `eval/` directory contains scripts to evaluate the model's reconstruction performance using objective metrics.

### Evaluation Prerequisites

1. **Install Dependencies**: The evaluation script has its own set of dependencies. Install them by running the script in the `eval` directory:
   ```bash
   bash eval/install_requirements.sh
   ```
   This will install libraries such as `auraloss`.

2. **FFmpeg**: The script uses `ffmpeg` for loudness analysis. Make sure `ffmpeg` is installed and available in your system's PATH. You can install it via conda:
   ```bash
   conda install -c conda-forge 'ffmpeg<7'
   ```

### Running Evaluation

The `eval_compare_matrix.py` script compares the reconstructed audio with the original ground-truth files and computes various metrics.

For more details on the evaluation metrics and options, refer to `eval/README.md`.

## 6. Acknowledgements

This project builds upon the work of several open-source projects. We would like to extend our special thanks to:

- **[Stability AI's Stable Audio Tools](https://github.com/Stability-AI/stable-audio-tools)**: For providing a foundational framework and tools for audio generation.
- **[Descript's Audio Codec](https://github.com/descriptinc/descript-audio-codec)**: For the weight-normed convolutional layers.

Their contributions have been invaluable to the development of εar-VAE.
config/model_config.json
ADDED
@@ -0,0 +1,34 @@
{
    "transformer": {
        "depth": 2,
        "config": {
            "rotary_pos_emb": true,
            "dim_heads": 32
        }
    },
    "encoder": {
        "config": {
            "in_channels": 2,
            "channels": 128,
            "c_mults": [1, 2, 4, 8, 16],
            "strides": [2, 4, 4, 4, 8],
            "latent_dim": 128,
            "use_snake": true
        }
    },
    "decoder": {
        "config": {
            "out_channels": 2,
            "channels": 128,
            "c_mults": [1, 2, 4, 8, 16],
            "strides": [2, 4, 4, 4, 8],
            "latent_dim": 64,
            "use_nearest_upsample": false,
            "use_snake": true,
            "final_tanh": false
        }
    },
    "latent_dim": 64,
    "downsampling_ratio": 1024,
    "io_channels": 2
}
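A quick sanity check on the configuration above: `downsampling_ratio` is the product of the encoder/decoder strides, so each latent frame summarizes 1024 samples of 44.1 kHz audio (roughly 43 latent frames per second with a 64-dimensional latent). A one-liner to verify:

```python
import math

strides = [2, 4, 4, 4, 8]     # from the encoder/decoder config above
ratio = math.prod(strides)    # 2*4*4*4*8 = 1024, matching "downsampling_ratio"
latent_rate = 44100 / ratio   # ~43.07 latent frames per second
print(ratio, latent_rate)
```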
eval/README.md
ADDED
@@ -0,0 +1,123 @@
# VAE Audio Evaluation

This directory contains the script and resources for evaluating model performance on audio reconstruction tasks. The primary script, `eval_compare_matrix.py`, computes a suite of objective metrics to compare the quality of audio generated by the model against the original ground-truth audio.

## Features

- **Comprehensive Metrics**: Calculates a wide range of industry-standard and research-grade metrics:
  - **Time-Domain**: Scale-Invariant Signal-to-Distortion Ratio (SI-SDR); a reference definition is sketched after this list.
  - **Frequency-Domain**: Multi-Resolution STFT Loss and Multi-Resolution Mel-Spectrogram Loss.
  - **Phase**: Multi-Resolution Phase Coherence (both per-channel and inter-channel for stereo).
  - **Loudness**: Integrated Loudness (LUFS-I), Loudness Range (LRA), and True Peak, analyzed using `ffmpeg`.
- **Batch Processing**: Automatically discovers and processes multiple model output directories.
- **File Matching**: Pairs reconstructed audio files (e.g., `*_vae_rec.wav`) with their corresponding ground-truth files (e.g., `*.wav`).
- **Robust & Resilient**: Handles missing files, audio processing errors, and varying sample rates gracefully.
- **Organized Output**: Saves aggregated results in both machine-readable (`.json`) and human-readable (`.txt`) formats for each model.
- **Command-Line Interface**: Easy-to-use CLI for specifying the input directory and other options.

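For reference, the SI-SDR listed above is conventionally defined by projecting the estimate onto the target before measuring distortion. The snippet below is a plain NumPy sketch of that textbook definition, not the `auraloss` implementation the evaluation script actually uses:

```python
import numpy as np

def si_sdr(estimate: np.ndarray, target: np.ndarray, eps: float = 1e-8) -> float:
    """Scale-Invariant SDR in dB for 1-D signals (textbook definition)."""
    # Scale the target so it best explains the estimate, then compare energies.
    alpha = np.dot(estimate, target) / (np.dot(target, target) + eps)
    projection = alpha * target
    noise = estimate - projection
    return 10.0 * np.log10((np.sum(projection ** 2) + eps) / (np.sum(noise ** 2) + eps))
```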
## Prerequisites

### 1. Python Environment
Ensure you have a Python environment (3.8 or newer recommended) with the required packages installed. You can install them using pip:
```bash
pip install torch torchaudio auraloss numpy
```

### 2. FFmpeg
The script relies on `ffmpeg` for loudness analysis. You must have `ffmpeg` installed and accessible in your system's PATH.

**On Ubuntu/Debian:**
```bash
sudo apt update && sudo apt install ffmpeg
```

**On macOS (using Homebrew):**
```bash
brew install ffmpeg
```

**On Windows:**
Download the executable from the [official FFmpeg website](https://ffmpeg.org/download.html) and add its `bin` directory to your system's PATH environment variable.

You can verify the installation by running:
```bash
ffmpeg -version
```

**In a conda environment:**
```bash
conda install -c conda-forge 'ffmpeg<7'
```

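For reference, the loudness analysis in `eval_compare_matrix.py` (added in this same commit) shells out to ffmpeg's `ebur128` filter roughly as follows, then parses the integrated loudness (I), loudness range (LRA), and true peak from the filter's output:

```bash
ffmpeg -nostats -i your_audio_file.wav \
  -af "ebur128=peak=true,ametadata=mode=print:file=-" \
  -f null -
```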
## Directory Structure

The script expects a specific directory structure for the evaluation data. The root input directory should contain subdirectories, where each subdirectory represents a different model or experiment to be evaluated.

Inside each model's subdirectory, place the pairs of ground-truth and reconstructed audio files. The script identifies pairs based on a naming convention:
- **Ground Truth**: `your_audio_file.wav`
- **Reconstructed**: `your_audio_file_vae_rec.wav`

Here is an example structure:
```
/path/to/your/evaluation_data/
├── model_A/
│   ├── song1.wav            # Ground Truth 1
│   ├── song1_vae_rec.wav    # Reconstructed 1
│   ├── song2.wav            # Ground Truth 2
│   ├── song2_vae_rec.wav    # Reconstructed 2
│   └── ...
├── model_B/
│   ├── trackA.wav
│   ├── trackA_vae_rec.wav
│   └── ...
└── ...
```

## Usage

Run the evaluation script from the command line, pointing it to the root directory containing your model outputs.

```bash
python eval_compare_matrix.py --input_dir /path/to/your/evaluation_data/
```

### Command-Line Arguments

- `--input_dir` (required): The path to the root directory containing the model folders (e.g., `/path/to/your/evaluation_data/`).
- `--force` (optional): If specified, the script re-runs the evaluation for all models, even if results files (`evaluation_results.json`) already exist. By default, it skips models that have already been evaluated.
- `--echo` (optional): If specified, the script prints the detailed evaluation metrics for each individual audio pair during processing. By default, only the progress bar and final summary are shown.

### Example
```bash
python eval/eval_compare_matrix.py --input_dir ./results/
```

## Output

After running, the script generates two files inside each model's directory:

1. **`evaluation_results.json`**: A JSON file containing the aggregated averages of all computed metrics. This is ideal for programmatic analysis.
   ```json
   {
       "model_name": "model_A",
       "file_count": 50,
       "avg_sisdr": 15.78,
       "avg_mel_distance": 0.45,
       "avg_stft_distance": 0.89,
       "avg_per_channel_coherence": 0.95,
       "avg_interchannel_coherence": 0.92,
       "avg_gen_lufs-i": -14.2,
       "avg_gt_lufs-i": -14.0,
       ...
   }
   ```

2. **`evaluation_summary.txt`**: A human-readable text file summarizing the results.
   ```
   model_name: model_A
   file_count: 50
   avg_sisdr: 15.78...
   avg_mel_distance: 0.45...
   ...
   ```
   This allows for quick inspection of a model's performance without needing to parse the JSON.
eval/eval_compare_matrix.py
ADDED
@@ -0,0 +1,406 @@
"""
Audio Evaluation Script

This script evaluates the quality of generated audio against ground truth audio
using a variety of metrics, including:
- SI-SDR (Scale-Invariant Signal-to-Distortion Ratio)
- Multi-Resolution STFT Loss
- Multi-Resolution Mel-Spectrogram Loss
- Phase Coherence (Per-channel and Inter-channel)
- Loudness metrics (LUFS-I, LRA, True Peak) via ffmpeg.

The script processes a directory of models, where each model directory contains
pairs of reconstructed (_rec.wav) and ground truth (.wav) audio files.
"""

import os
import re
import sys
import json
import logging
import argparse
import subprocess
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import numpy as np
import torch
import torch.nn as nn
import torchaudio
import auraloss
from tqdm import tqdm

# --- Setup ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SAMPLE_RATE = 44100

# --- Metric Definitions ---

# SI-SDR
sisdr_criteria = auraloss.time.SISDRLoss().to(DEVICE)

# Multi-Resolution Mel-Spectrogram Loss
mel_fft_sizes = [4096, 2048, 1024, 512]
mel_win_sizes = mel_fft_sizes
mel_hop_sizes = [i // 4 for i in mel_fft_sizes]
mel_criteria = auraloss.freq.MultiResolutionSTFTLoss(
    fft_sizes=mel_fft_sizes,
    hop_sizes=mel_hop_sizes,
    win_lengths=mel_win_sizes,
    sample_rate=SAMPLE_RATE,
    scale="mel",
    n_bins=64,
    perceptual_weighting=True
).to(DEVICE)

# Multi-Resolution STFT Loss
fft_sizes = [4096, 2048, 1024, 512, 256, 128]
win_sizes = fft_sizes
hop_sizes = [i // 4 for i in fft_sizes]
stft_criteria = auraloss.freq.MultiResolutionSTFTLoss(
    fft_sizes=fft_sizes,
    hop_sizes=hop_sizes,
    win_lengths=win_sizes,
    sample_rate=SAMPLE_RATE,
    perceptual_weighting=True
).to(DEVICE)


def analyze_loudness(file_path: str) -> Optional[Dict[str, float]]:
    """
    Analyzes audio file loudness using ffmpeg's ebur128 filter.

    Args:
        file_path: Path to the audio file.

    Returns:
        A dictionary with LUFS-I, LRA, and True Peak, or None on failure.
    """
    if not Path(file_path).exists():
        logging.warning(f"Loudness analysis skipped: File not found at {file_path}")
        return None

    command = [
        "ffmpeg",
        "-nostats",
        "-i", file_path,
        "-af", "ebur128=peak=true,ametadata=mode=print:file=-",
        "-f", "null",
        "-"
    ]

    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True, encoding='utf-8')
        output_text = result.stderr
    except FileNotFoundError:
        logging.error("ffmpeg not found. Please install ffmpeg and ensure it's in your PATH.")
        return None
    except subprocess.CalledProcessError as e:
        logging.error(f"ffmpeg analysis failed for {file_path}. Error: {e.stderr}")
        return None

    loudness_data = {}

    i_match = re.search(r"^\s*I:\s*(-?[\d\.]+)\s*LUFS", output_text, re.MULTILINE)
    if i_match:
        loudness_data['LUFS-I'] = float(i_match.group(1))

    lra_match = re.search(r"^\s*LRA:\s*([\d\.]+)\s*LU", output_text, re.MULTILINE)
    if lra_match:
        loudness_data['LRA'] = float(lra_match.group(1))

    tp_match = re.search(r"Peak:\s*(-?[\d\.]+)\s*dBFS", output_text, re.MULTILINE)
    if tp_match:
        loudness_data['True Peak'] = float(tp_match.group(1))

    if not loudness_data:
        logging.warning(f"Could not parse loudness data for {file_path}.")
        return None

    return loudness_data


class PhaseCoherenceLoss(nn.Module):
    """
    Calculates phase coherence between two audio signals.
    Adapted for stereo and multi-resolution analysis.
    """
    def __init__(self, fft_size, hop_size, win_size, mag_threshold=1e-6, eps=1e-8):
        super().__init__()
        self.fft_size = int(fft_size)
        self.hop_size = int(hop_size)
        self.win_size = int(win_size)
        self.register_buffer("window", torch.hann_window(win_size))
        self.mag_threshold = float(mag_threshold)
        self.eps = float(eps)

    def _to_complex(self, x):
        if torch.is_complex(x):
            return x
        if x.dim() >= 1 and x.size(-1) == 2:
            return torch.complex(x[..., 0], x[..., 1])
        raise ValueError("Input must be complex or real/imag tensor.")

    def _stereo_stft(self, x):
        if x.dim() == 2:
            x = x.unsqueeze(0)
        B, C, T = x.shape
        stft = torch.stft(x.reshape(B * C, T),
                          n_fft=self.fft_size,
                          hop_length=self.hop_size,
                          win_length=self.win_size,
                          window=self.window,
                          return_complex=True)
        return stft.view(B, C, -1, stft.size(-1))

    def forward(self, pred, target):
        pred_stft = self._stereo_stft(pred)
        target_stft = self._stereo_stft(target)

        pred_stft = self._to_complex(pred_stft)
        target_stft = self._to_complex(target_stft)

        B, C, F, T = pred_stft.shape

        # magnitudes and weights
        mag_pred = torch.abs(pred_stft)
        mag_target = torch.abs(target_stft)
        weights = mag_pred * mag_target
        mask = (weights > self.mag_threshold).to(weights.dtype)
        weights_masked = weights * mask

        # phase difference Δφ = angle(pred) - angle(target)
        delta = torch.angle(pred_stft) - torch.angle(target_stft)
        # phasor e^{jΔφ}
        phasor = torch.complex(torch.cos(delta), torch.sin(delta))

        # weighted vector sum across frequency axis
        num = torch.sum(weights_masked * phasor, dim=2)  # [B, C, T], complex
        den = torch.sum(weights_masked, dim=2).clamp_min(self.eps)
        coherence_per_bin = torch.abs(num) / den

        # pool across time (energy-weighted mean) -> per-channel scalar
        # weight time pooling by per-frame energy sum to emphasize active frames
        frame_energy = torch.sum(weights_masked, dim=2)
        frame_energy_sum = torch.sum(frame_energy, dim=2).clamp_min(self.eps)

        # energy-weighted average over time
        coherence_chan = torch.sum(coherence_per_bin * frame_energy, dim=2) / frame_energy_sum

        # mean across batch
        per_channel_coherence = coherence_chan.mean(dim=0)

        inter_coherence = None
        if C >= 2:
            Lp, Rp = pred_stft[:, 0], pred_stft[:, 1]
            Lt, Rt = target_stft[:, 0], target_stft[:, 1]

            # inter-channel phase: angle(L) - angle(R) <=> angle(L * conj(R))
            inter_delta = torch.angle(Lp * torch.conj(Rp)) - torch.angle(Lt * torch.conj(Rt))
            inter_weights = torch.abs(Lp) * torch.abs(Rp)
            inter_mask = (inter_weights > self.mag_threshold).to(inter_weights.dtype)
            inter_weights_masked = inter_weights * inter_mask
            inter_phasor = torch.complex(torch.cos(inter_delta), torch.sin(inter_delta))
            inter_num = torch.sum(inter_weights_masked * inter_phasor, dim=1)
            inter_den = torch.sum(inter_weights_masked, dim=1).clamp_min(self.eps)
            inter_coh_time = torch.abs(inter_num) / inter_den

            # pool across time weighted by energy
            inter_frame_energy = torch.sum(inter_weights_masked, dim=1)
            inter_energy_sum = inter_frame_energy.sum(dim=1).clamp_min(self.eps)
            inter_coh_b = (inter_coh_time * inter_frame_energy).sum(dim=1) / inter_energy_sum
            inter_coherence = inter_coh_b.mean()

        return {
            "per_channel_coherence": per_channel_coherence.detach().cpu(),
            "interchannel_coherence": (inter_coherence.detach().cpu() if inter_coherence is not None else None),
        }


class MultiResolutionPhaseCoherenceLoss(nn.Module):
    def __init__(self, fft_sizes, hop_sizes, win_sizes):
        super().__init__()
        self.criteria = nn.ModuleList([
            PhaseCoherenceLoss(fft, hop, win) for fft, hop, win in zip(fft_sizes, hop_sizes, win_sizes)
        ])

    def forward(self, pred, target):
        results = [criterion(pred, target) for criterion in self.criteria]

        per_channel = torch.stack([r["per_channel_coherence"] for r in results]).mean(dim=0)
        inter_items = [r["interchannel_coherence"] for r in results if r["interchannel_coherence"] is not None]
        inter_channel = torch.stack(inter_items).mean() if inter_items else None

        return {"per_channel_coherence": per_channel, "interchannel_coherence": inter_channel}

phase_coherence_criteria = MultiResolutionPhaseCoherenceLoss(
    fft_sizes=mel_fft_sizes, hop_sizes=mel_hop_sizes, win_sizes=mel_win_sizes
).to(DEVICE)


def find_audio_pairs(model_path: Path) -> List[Tuple[Path, Path]]:
    """Finds pairs of reconstructed and ground truth audio files."""
    rec_files = sorted(model_path.glob("*_vae_rec.wav"))
    pairs = []
    for rec_file in rec_files:
        gt_file = model_path / rec_file.name.replace("_vae_rec.wav", ".wav")
        if gt_file.exists():
            pairs.append((rec_file, gt_file))
        else:
            logging.warning(f"Ground truth file not found for {rec_file.name}")
    return pairs


def evaluate_pair(rec_path: Path, gt_path: Path) -> Optional[Dict[str, float]]:
    """Evaluates a single pair of audio files."""
    try:
        gen_wav, gen_sr = torchaudio.load(rec_path, backend="ffmpeg")
        gt_wav, gt_sr = torchaudio.load(gt_path, backend="ffmpeg")

        if gen_sr != SAMPLE_RATE:
            gen_wav = torchaudio.transforms.Resample(gen_sr, SAMPLE_RATE)(gen_wav)
        if gt_sr != SAMPLE_RATE:
            gt_wav = torchaudio.transforms.Resample(gt_sr, SAMPLE_RATE)(gt_wav)

        # Trim to same length
        if gen_wav.shape[-1] != gt_wav.shape[-1]:
            logging.info(f"Shape Mismatched, Trimming audio files to the same length: {rec_path.name}, {gt_path.name}")
            min_len = min(gen_wav.shape[-1], gt_wav.shape[-1])
            gen_wav, gt_wav = gen_wav[:, :min_len], gt_wav[:, :min_len]

        gen_wav, gt_wav = gen_wav.to(DEVICE).unsqueeze(0), gt_wav.to(DEVICE).unsqueeze(0)

        metrics = {}
        metrics['sisdr'] = -sisdr_criteria(gen_wav, gt_wav).item()
        metrics['mel_distance'] = mel_criteria(gen_wav, gt_wav).item()
        metrics['stft_distance'] = stft_criteria(gen_wav, gt_wav).item()

        phase_metrics = phase_coherence_criteria(gen_wav, gt_wav)
        metrics['per_channel_coherence'] = phase_metrics["per_channel_coherence"].mean().item()
        if phase_metrics["interchannel_coherence"] is not None:
            metrics['interchannel_coherence'] = phase_metrics["interchannel_coherence"].item()

        return metrics
    except Exception as e:
        logging.error(f"Error processing pair {rec_path.name}, {gt_path.name}: {e}")
        return None


def process_model(model_path: Path, force_eval: bool = False, echo=True):
    """Processes all audio pairs for a given model."""
    logging.info(f"Processing model: {model_path.name}")
    results_file = model_path / "evaluation_results.json"

    if results_file.exists() and not force_eval:
        logging.info(f"Results already exist for {model_path.name}, skipping.")
        return

    audio_pairs = find_audio_pairs(model_path)
    if not audio_pairs:
        logging.warning(f"No valid audio pairs found for {model_path.name}.")
        return

    all_metrics = []
    gen_loudness_data, gt_loudness_data = [], []

    with torch.no_grad():
        for rec_path, gt_path in tqdm(audio_pairs, desc=f"Evaluating {model_path.name}"):
            pair_metrics = evaluate_pair(rec_path, gt_path)
            if pair_metrics:
                all_metrics.append(pair_metrics)

            gen_loudness = analyze_loudness(str(rec_path))
            if gen_loudness:
                gen_loudness_data.append(gen_loudness)

            gt_loudness = analyze_loudness(str(gt_path))
            if gt_loudness:
                gt_loudness_data.append(gt_loudness)

            if echo:
                logging.info(f"Metrics for {rec_path.name}: {pair_metrics}")
                if gen_loudness:
                    logging.info(f"Generated Loudness: {gen_loudness}")
                if gt_loudness:
                    logging.info(f"Ground Truth Loudness: {gt_loudness}")

    if not all_metrics:
        logging.warning(f"No metrics could be calculated for {model_path.name}.")
        return

    # Aggregate results
    summary = {"model_name": model_path.name, "file_count": len(all_metrics)}

    # Average objective metrics
    metric_keys = all_metrics[0].keys()
    for key in metric_keys:
        valid_values = [m[key] for m in all_metrics if key in m]
        if valid_values:
            summary[f"avg_{key}"] = float(np.mean(valid_values))

    # Average loudness metrics
    def _avg_loudness(data: List[Dict[str, float]], prefix: str):
        if not data: return
        for key in data[0].keys():
            values = [d[key] for d in data if key in d]
            if values:
                summary[f"avg_{prefix}_{key.lower().replace(' ', '_')}"] = float(np.mean(values))

    _avg_loudness(gen_loudness_data, "gen")
    _avg_loudness(gt_loudness_data, "gt")

    # Save results
    logging.info(f"Saving results for {model_path.name} to {results_file}")
    with open(results_file, 'w') as f:
        json.dump(summary, f, indent=4)

    # Also save a human-readable version
    with open(model_path / "evaluation_summary.txt", "w") as f:
        for key, value in summary.items():
            f.write(f"{key}: {value}\n")


def main():
    parser = argparse.ArgumentParser(description="Run evaluation on generated audio.")
    parser.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="Root directory containing model output folders."
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force re-evaluation even if results files exist."
    )

    parser.add_argument(
        "--echo",
        action="store_true",
        help="Echo per-file metrics to console during evaluation."
    )
    args = parser.parse_args()

    root_path = Path(args.input_dir)
    if not root_path.is_dir():
        logging.error(f"Input directory not found: {root_path}")
        sys.exit(1)

    model_paths = [p for p in root_path.iterdir() if p.is_dir() and not p.name.startswith('.')]

    logging.info(f"Found {len(model_paths)} model(s) to evaluate: {[p.name for p in model_paths]}")

    for model_path in sorted(model_paths):
        process_model(model_path, args.force, args.echo)

    logging.info("Evaluation complete.")


if __name__ == "__main__":
    main()
eval/install_requirements.sh
ADDED
@@ -0,0 +1 @@
pip install torch torchaudio auraloss numpy
inference.py
ADDED
@@ -0,0 +1,85 @@
import torchaudio
from pathlib import Path
from tqdm import tqdm
import torch
import argparse
import json
from model.ear_vae import EAR_VAE


def main(args):
    indir = args.indir
    model_path = args.model_path
    outdir = args.outdir
    device = args.device
    config_path = args.config

    print(f"Input directory: {indir}")
    print(f"Model path: {model_path}")
    print(f"Output directory: {outdir}")
    print(f"Device: {device}")
    print(f"Config path: {config_path}")

    input_path = Path(indir)
    output_path_dir = Path(outdir)
    output_path_dir.mkdir(parents=True, exist_ok=True)

    with open(config_path, 'r') as f:
        vae_gan_model_config = json.load(f)

    print("Loading model...")
    model = EAR_VAE(model_config=vae_gan_model_config).to(device)

    state = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state)
    model.eval()
    print("Model loaded successfully.")

    audios = list(input_path.rglob("*"))
    print(f"Found {len(audios)} audio files to process.")

    with torch.no_grad():
        for audio_path in tqdm(audios, desc="Processing audio files"):
            try:
                gt_y, sr = torchaudio.load(audio_path, backend="ffmpeg")

                if len(gt_y.shape) == 1:
                    gt_y = gt_y.unsqueeze(0)

                # Move to the target device first so the waveform and the
                # Resample kernel live on the same device
                gt_y = gt_y.to(device, torch.float32)

                # Resample if necessary
                if sr != 44100:
                    resampler = torchaudio.transforms.Resample(sr, 44100).to(device)
                    gt_y = resampler(gt_y)

                # Convert to stereo if mono
                if gt_y.shape[0] == 1:
                    gt_y = torch.cat([gt_y, gt_y], dim=0)

                # Add batch dimension
                gt_y = gt_y.unsqueeze(0)

                fake_audio = model.inference(gt_y)

                output_filename = f"{Path(audio_path).stem}_{Path(model_path).stem}.wav"
                output_path = output_path_dir / output_filename

                fake_audio_processed = fake_audio.squeeze(0).cpu()
                torchaudio.save(output_path, fake_audio_processed, sample_rate=44100, backend="ffmpeg")
            except Exception as e:
                print(f"Error processing {audio_path}: {e}")
                continue


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Run VAE-GAN audio inference.")
    parser.add_argument('--indir', type=str, default='./data', help='Input directory for audio files.')
    parser.add_argument('--model_path', type=str, default='./pretrained_weight/ear_vae_44k.pyt', help='Path to the pretrained model weight.')
    parser.add_argument('--outdir', type=str, default='./results', help='Output directory for generated audio files.')
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu', help='Device to run the model on (e.g., "cuda:0" or "cpu").')
    parser.add_argument('--config', type=str, default='./config/model_config.json', help='Path to the model config file.')

    args = parser.parse_args()
    main(args)
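A minimal sketch for exercising inference.py without the command line, assuming the repository's default paths; the Namespace fields simply mirror the argparse options above, and nothing here is part of the upstream interface.

# Hypothetical driver for the script above; adjust the paths to your checkout.
import torch
from argparse import Namespace

from inference import main

args = Namespace(
    indir="./data",
    model_path="./pretrained_weight/ear_vae_44k.pyt",
    outdir="./results",
    device="cuda" if torch.cuda.is_available() else "cpu",
    config="./config/model_config.json",
)
main(args)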
install_requirements.sh
ADDED
@@ -0,0 +1,3 @@
pip install descript-audio-codec
python -m pip install alias-free-torch
conda install -c conda-forge 'ffmpeg<7'
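An assumed sanity check (not part of the repository) that the three dependencies above are importable and that torchaudio can see the ffmpeg backend the inference script requests.

# Assumed environment check; module names follow the imports used elsewhere in this repo.
import dac                   # installed by descript-audio-codec
import alias_free_torch      # installed by alias-free-torch
import torchaudio

print(torchaudio.list_audio_backends())  # should include "ffmpeg" after the conda install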
model/autoencoders.py
ADDED
@@ -0,0 +1,374 @@
import torch
import math

from torch import nn, pow
from alias_free_torch import Activation1d
from dac.nn.layers import WNConv1d, WNConvTranspose1d
from typing import Literal


def snake_beta(x, alpha, beta):
    return x + (1.0 / (beta + 0.000000001)) * pow(torch.sin(x * alpha), 2)


class SnakeBeta(nn.Module):
    def __init__(
        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True
    ):
        super(SnakeBeta, self).__init__()
        self.in_features = in_features

        # initialize alpha
        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:  # log scale alphas initialized to zeros
            self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
            self.beta = nn.Parameter(torch.zeros(in_features) * alpha)
        else:  # linear scale alphas initialized to ones
            self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
            self.beta = nn.Parameter(torch.ones(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
        beta = self.beta.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
            beta = torch.exp(beta)
        x = snake_beta(x, alpha, beta)

        return x


def checkpoint(function, *args, **kwargs):
    kwargs.setdefault("use_reentrant", False)
    return torch.utils.checkpoint.checkpoint(function, *args, **kwargs)


def get_activation(
    activation: Literal["elu", "snake", "none"], antialias=False, channels=None
) -> nn.Module:
    if activation == "elu":
        act = nn.ELU()
    elif activation == "snake":
        act = SnakeBeta(channels)
    elif activation == "none":
        act = nn.Identity()
    else:
        raise ValueError(f"Unknown activation {activation}")

    if antialias:
        act = Activation1d(act)

    return act


class ResidualUnit(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        dilation,
        use_snake=False,
        antialias_activation=False,
        bias=True,
    ):
        super().__init__()

        self.dilation = dilation

        act = get_activation(
            "snake" if use_snake else "elu",
            antialias=antialias_activation,
            channels=out_channels,
        )

        padding = (dilation * (7 - 1)) // 2

        self.layers = nn.Sequential(
            act,
            WNConv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=7,
                dilation=dilation,
                padding=padding,
                bias=bias,
            ),
            act,
            WNConv1d(
                in_channels=out_channels, out_channels=out_channels, kernel_size=1, bias=bias
            ),
        )

    def forward(self, x):
        res = x

        # x = checkpoint(self.layers, x)
        x = self.layers(x)

        return x + res


class EncoderBlock(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        stride,
        use_snake=False,
        antialias_activation=False,
        bias=True,
    ):
        super().__init__()

        act = get_activation(
            "snake" if use_snake else "elu",
            antialias=antialias_activation,
            channels=in_channels,
        )

        self.layers = nn.Sequential(
            ResidualUnit(
                in_channels=in_channels,
                out_channels=in_channels,
                dilation=1,
                use_snake=use_snake,
                bias=bias,
            ),
            ResidualUnit(
                in_channels=in_channels,
                out_channels=in_channels,
                dilation=3,
                use_snake=use_snake,
                bias=bias,
            ),
            ResidualUnit(
                in_channels=in_channels,
                out_channels=in_channels,
                dilation=9,
                use_snake=use_snake,
                bias=bias,
            ),
            act,
            WNConv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
                bias=bias,
            ),
        )

    def forward(self, x):
        return self.layers(x)


class AntiAliasUpsamplerBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=2, bias=True):
        super().__init__()

        self.upsample = nn.Upsample(scale_factor=stride, mode="nearest")

        self.conv = WNConv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=2 * stride,
            bias=bias,
            padding="same",
        )

    def forward(self, x):
        x = self.upsample(x)
        x = self.conv(x)
        return x


class DecoderBlock(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        stride,
        use_snake=False,
        antialias_activation=False,
        use_nearest_upsample=False,
        bias=True,
    ):
        super().__init__()

        if use_nearest_upsample:
            upsample_layer = AntiAliasUpsamplerBlock(
                in_channels=in_channels, out_channels=out_channels, stride=stride, bias=bias
            )
        else:
            upsample_layer = WNConvTranspose1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
                bias=bias,
            )

        act = get_activation(
            "snake" if use_snake else "elu",
            antialias=antialias_activation,
            channels=in_channels,
        )

        self.layers = nn.Sequential(
            act,
            upsample_layer,
            ResidualUnit(
                in_channels=out_channels,
                out_channels=out_channels,
                dilation=1,
                use_snake=use_snake,
                bias=bias,
            ),
            ResidualUnit(
                in_channels=out_channels,
                out_channels=out_channels,
                dilation=3,
                use_snake=use_snake,
                bias=bias,
            ),
            ResidualUnit(
                in_channels=out_channels,
                out_channels=out_channels,
                dilation=9,
                use_snake=use_snake,
                bias=bias,
            ),
        )

    def forward(self, x):
        return self.layers(x)


class OobleckEncoder(nn.Module):
    def __init__(
        self,
        in_channels=2,
        channels=128,
        latent_dim=32,
        c_mults=[1, 2, 4, 8],
        strides=[2, 4, 8, 8],
        use_snake=False,
        antialias_activation=False,
        bias=True,
    ):
        super().__init__()

        c_mults = [1] + c_mults

        self.depth = len(c_mults)

        layers = [
            WNConv1d(
                in_channels=in_channels,
                out_channels=c_mults[0] * channels,
                kernel_size=7,
                padding=3,
                bias=bias,
            )
        ]

        for i in range(self.depth - 1):
            layers += [
                EncoderBlock(
                    in_channels=c_mults[i] * channels,
                    out_channels=c_mults[i + 1] * channels,
                    stride=strides[i],
                    use_snake=use_snake,
                    bias=bias,
                )
            ]

        layers += [
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=c_mults[-1] * channels,
            ),
            WNConv1d(
                in_channels=c_mults[-1] * channels,
                out_channels=latent_dim,
                kernel_size=3,
                padding=1,
                bias=bias,
            ),
        ]

        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)


class OobleckDecoder(nn.Module):
    def __init__(
        self,
        out_channels=2,
        channels=128,
        latent_dim=32,
        c_mults=[1, 2, 4, 8],
        strides=[2, 4, 8, 8],
        use_snake=False,
        antialias_activation=False,
        use_nearest_upsample=False,
        final_tanh=True,
        bias=True,
    ):
        super().__init__()

        c_mults = [1] + c_mults

        self.depth = len(c_mults)

        layers = [
            WNConv1d(
                in_channels=latent_dim,
                out_channels=c_mults[-1] * channels,
                kernel_size=7,
                padding=3,
                bias=bias,
            ),
        ]

        for i in range(self.depth - 1, 0, -1):
            layers += [
                DecoderBlock(
                    in_channels=c_mults[i] * channels,
                    out_channels=c_mults[i - 1] * channels,
                    stride=strides[i - 1],
                    use_snake=use_snake,
                    antialias_activation=antialias_activation,
                    use_nearest_upsample=use_nearest_upsample,
                    bias=bias,
                )
            ]

        layers += [
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=c_mults[0] * channels,
            ),
            WNConv1d(
                in_channels=c_mults[0] * channels,
                out_channels=out_channels,
                kernel_size=7,
                padding=3,
                bias=False,
            ),
            nn.Tanh() if final_tanh else nn.Identity(),
        ]

        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)
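A rough shape check for the encoder/decoder pair, using the same hyperparameters EAR_VAE passes in by default; the batch size and clip length below are illustrative assumptions, not part of the repository. The total time compression is the product of the strides (2*4*4*4*8 = 1024), and the encoder's 128 output channels are later split into a 64-dim mean and a 64-dim scale.

# Sketch under assumed inputs: verify the ~1024x temporal compression.
import torch
from model.autoencoders import OobleckEncoder, OobleckDecoder

enc = OobleckEncoder(in_channels=2, channels=128, latent_dim=128,
                     c_mults=[1, 2, 4, 8, 16], strides=[2, 4, 4, 4, 8], use_snake=True)
dec = OobleckDecoder(out_channels=2, channels=128, latent_dim=64,
                     c_mults=[1, 2, 4, 8, 16], strides=[2, 4, 4, 4, 8],
                     use_snake=True, final_tanh=False)

x = torch.randn(1, 2, 44100)   # roughly one second of stereo audio
h = enc(x)                     # [1, 128, 43]: 44100 samples -> 43 latent frames
y = dec(h[:, :64, :])          # the first 64 channels are the posterior mean
print(h.shape, y.shape)        # decoder returns [1, 2, 44032], slightly short of 44100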
model/ear_vae.py
ADDED
@@ -0,0 +1,112 @@
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn

from torch import Tensor, nn, no_grad
from .autoencoders import OobleckDecoder, OobleckEncoder

from .transformer import ContinuousTransformer

LRELU_SLOPE = 0.1
padding_mode = "zeros"
sample_eps = 1e-6


def vae_sample(mean, scale):
    stdev = nn.functional.softplus(scale)
    var = stdev * stdev + sample_eps
    logvar = torch.log(var)
    latents = torch.randn_like(mean) * stdev + mean

    kl = (mean * mean + var - logvar - 1).sum(1).mean()

    return latents, kl


class EAR_VAE(nn.Module):

    def __init__(self, model_config: dict = None):
        super().__init__()

        if model_config is None:
            model_config = {
                "encoder": {
                    "config": {
                        "in_channels": 2,
                        "channels": 128,
                        "c_mults": [1, 2, 4, 8, 16],
                        "strides": [2, 4, 4, 4, 8],
                        "latent_dim": 128,
                        "use_snake": True
                    }
                },
                "decoder": {
                    "config": {
                        "out_channels": 2,
                        "channels": 128,
                        "c_mults": [1, 2, 4, 8, 16],
                        "strides": [2, 4, 4, 4, 8],
                        "latent_dim": 64,
                        "use_nearest_upsample": False,
                        "use_snake": True,
                        "final_tanh": False,
                    },
                },
                "latent_dim": 64,
                "downsampling_ratio": 1024,
                "io_channels": 2,
            }
        else:
            model_config = model_config

        if model_config.get("transformer") is not None:
            self.transformers = ContinuousTransformer(
                dim=model_config["decoder"]["config"]["latent_dim"],
                depth=model_config["transformer"]["depth"],
                **model_config["transformer"].get("config", {}),
            )
        else:
            self.transformers = None

        self.encoder = OobleckEncoder(**model_config["encoder"]["config"])
        self.decoder = OobleckDecoder(**model_config["decoder"]["config"])

    def forward(self, audio) -> Tensor:
        """
        audio: Input audio tensor [B,C,T]
        """
        status = self.encoder(audio)
        mean, scale = status.chunk(2, dim=1)
        z, kl = vae_sample(mean, scale)

        if self.transformers is not None:
            z = z.permute(0, 2, 1)
            z = self.transformers(z)
            z = z.permute(0, 2, 1)

        x = self.decoder(z)
        return x, kl

    def encode(self, audio, use_sample=True):
        x = self.encoder(audio)
        mean, scale = x.chunk(2, dim=1)
        if use_sample:
            z, _ = vae_sample(mean, scale)
        else:
            z = mean
        return z

    def decode(self, z):

        if self.transformers is not None:
            z = z.permute(0, 2, 1)
            z = self.transformers(z)
            z = z.permute(0, 2, 1)

        x = self.decoder(z)
        return x

    @no_grad()
    def inference(self, audio):
        z = self.encode(audio)
        recon_audio = self.decode(z)
        return recon_audio
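A minimal round trip through EAR_VAE's built-in default configuration; the random input and its length are assumptions for the example. encode() chunks the encoder output into a mean and a scale and optionally draws the reparameterized sample from vae_sample, while inference() is the same no-grad encode/decode path that inference.py calls.

# Sketch: encode/decode a random stereo clip with the default 44.1 kHz config.
import torch
from model.ear_vae import EAR_VAE

model = EAR_VAE().eval()
audio = torch.randn(1, 2, 44100)                 # [B, C, T]

with torch.no_grad():
    z = model.encode(audio, use_sample=False)    # posterior mean, [1, 64, 43]
    recon = model.decode(z)                      # [1, 2, 44032]
    recon_sampled = model.inference(audio)       # sampled-latent round trip

print(z.shape, recon.shape, recon_sampled.shape)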
model/transformer.py
ADDED
@@ -0,0 +1,846 @@
from typing import Callable, Literal

import torch
import torch.nn.functional as F
from einops import rearrange
from einops.layers.torch import Rearrange
from packaging import version
from torch import einsum, nn
from torch.cuda.amp import autocast

try:
    from flash_attn import flash_attn_func, flash_attn_kvpacked_func
    # flash_attn==2.3.3 is required
except ImportError as e:
    print(e)
    print('flash_attn not installed, disabling Flash Attention')
    flash_attn_kvpacked_func = None
    flash_attn_func = None

try:
    import natten
except ImportError:
    natten = None

import math
from functools import reduce

import numpy as np


class FourierFeatures(nn.Module):
    def __init__(self, in_features, out_features, std=1.):
        super().__init__()
        assert out_features % 2 == 0
        self.weight = nn.Parameter(torch.randn(
            [out_features // 2, in_features]) * std)

    def forward(self, input):
        f = 2 * math.pi * input @ self.weight.T
        return torch.cat([f.cos(), f.sin()], dim=-1)


def normalize(x, eps=1e-4):
    dim = list(range(1, x.ndim))
    n = torch.linalg.vector_norm(x, dim=dim, keepdim=True)
    alpha = np.sqrt(n.numel() / x.numel())
    return x / torch.add(eps, n, alpha=alpha)


def checkpoint(function, *args, **kwargs):
    kwargs.setdefault("use_reentrant", False)
    return torch.utils.checkpoint.checkpoint(function, *args, **kwargs)


def create_causal_mask(i, j, device):
    return torch.ones((i, j), device=device, dtype=torch.bool).triu(j - i + 1)


def or_reduce(masks):
    head, *body = masks
    for rest in body:
        head = head | rest
    return head


# positional embeddings

class AbsolutePositionalEmbedding(nn.Module):
    def __init__(self, dim, max_seq_len):
        super().__init__()
        self.scale = dim ** -0.5
        self.max_seq_len = max_seq_len
        self.emb = nn.Embedding(max_seq_len, dim)

    def forward(self, x, pos=None, seq_start_pos=None):
        seq_len, device = x.shape[1], x.device
        assert seq_len <= self.max_seq_len, f'you are passing in a sequence length of {seq_len} but your absolute positional embedding has a max sequence length of {self.max_seq_len}'

        if pos is None:
            pos = torch.arange(seq_len, device=device)

        if seq_start_pos is not None:
            pos = (pos - seq_start_pos[..., None]).clamp(min=0)

        pos_emb = self.emb(pos)
        pos_emb = pos_emb * self.scale
        return pos_emb


class ScaledSinusoidalEmbedding(nn.Module):
    def __init__(self, dim, theta=10000):
        super().__init__()
        assert (dim % 2) == 0, 'dimension must be divisible by 2'
        self.scale = nn.Parameter(torch.ones(1) * dim ** -0.5)

        half_dim = dim // 2
        freq_seq = torch.arange(half_dim).float() / half_dim
        inv_freq = theta ** -freq_seq
        self.register_buffer('inv_freq', inv_freq, persistent=False)

    def forward(self, x, pos=None, seq_start_pos=None):
        seq_len, device = x.shape[1], x.device

        if pos is None:
            pos = torch.arange(seq_len, device=device)

        if seq_start_pos is not None:
            pos = pos - seq_start_pos[..., None]

        emb = einsum('i, j -> i j', pos, self.inv_freq)
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb * self.scale


class RotaryEmbedding(nn.Module):
    def __init__(
        self,
        dim,
        use_xpos=False,
        scale_base=512,
        interpolation_factor=1.,
        base=10000,
        base_rescale_factor=1.
    ):
        super().__init__()
        # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
        # has some connection to NTK literature
        # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
        base *= base_rescale_factor ** (dim / (dim - 2))

        inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)

        assert interpolation_factor >= 1.
        self.interpolation_factor = interpolation_factor

        if not use_xpos:
            self.register_buffer('scale', None)
            return

        scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)

        self.scale_base = scale_base
        self.register_buffer('scale', scale)

    def forward_from_seq_len(self, seq_len):
        device = self.inv_freq.device

        t = torch.arange(seq_len, device=device)
        return self.forward(t)

    @torch.amp.autocast('cuda', enabled=False)
    def forward(self, t):
        device = self.inv_freq.device

        t = t.to(torch.float32)

        t = t / self.interpolation_factor

        freqs = torch.einsum('i , j -> i j', t, self.inv_freq)
        freqs = torch.cat((freqs, freqs), dim=-1)

        if self.scale is None:
            return freqs, 1.

        power = (torch.arange(seq_len, device=device) - (seq_len // 2)) / self.scale_base
        scale = self.scale ** rearrange(power, 'n -> n 1')
        scale = torch.cat((scale, scale), dim=-1)

        return freqs, scale


def rotate_half(x):
    x = rearrange(x, '... (j d) -> ... j d', j=2)
    x1, x2 = x.unbind(dim=-2)
    return torch.cat((-x2, x1), dim=-1)


@torch.amp.autocast('cuda', enabled=False)
def apply_rotary_pos_emb(t, freqs, scale=1):
    out_dtype = t.dtype

    # cast to float32 if necessary for numerical stability
    dtype = reduce(torch.promote_types, (t.dtype, freqs.dtype, torch.float32))
    rot_dim, seq_len = freqs.shape[-1], t.shape[-2]
    freqs, t = freqs.to(dtype), t.to(dtype)
    freqs = freqs[-seq_len:, :]

    if t.ndim == 4 and freqs.ndim == 3:
        freqs = rearrange(freqs, 'b n d -> b 1 n d')

    # partial rotary embeddings, Wang et al. GPT-J
    t, t_unrotated = t[..., :rot_dim], t[..., rot_dim:]
    t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)

    t, t_unrotated = t.to(out_dtype), t_unrotated.to(out_dtype)

    return torch.cat((t, t_unrotated), dim=-1)


# norms
class LayerNorm(nn.Module):
    def __init__(self, dim, bias=False, fix_scale=False):
        """
        bias-less layernorm has been shown to be more stable. most newer models have moved towards rmsnorm, also bias-less
        """
        super().__init__()

        if fix_scale:
            self.register_buffer("gamma", torch.ones(dim))
        else:
            self.gamma = nn.Parameter(torch.ones(dim))

        if bias:
            self.beta = nn.Parameter(torch.zeros(dim))
        else:
            self.register_buffer("beta", torch.zeros(dim))

    def forward(self, x):
        return F.layer_norm(x, x.shape[-1:], weight=self.gamma, bias=self.beta)


# feedforward

class GLU(nn.Module):
    def __init__(
        self,
        dim_in,
        dim_out,
        activation: Callable,
        use_conv=False,
        conv_kernel_size=3,
    ):
        super().__init__()
        self.act = activation
        self.proj = nn.Linear(dim_in, dim_out * 2) if not use_conv else nn.Conv1d(
            dim_in, dim_out * 2, conv_kernel_size, padding=(conv_kernel_size // 2))
        self.use_conv = use_conv

    def forward(self, x):
        if self.use_conv:
            x = rearrange(x, 'b n d -> b d n')
            x = self.proj(x)
            x = rearrange(x, 'b d n -> b n d')
        else:
            x = self.proj(x)

        x, gate = x.chunk(2, dim=-1)
        return x * self.act(gate)


class FeedForward(nn.Module):
    def __init__(
        self,
        dim,
        dim_out=None,
        mult=4,
        no_bias=False,
        glu=True,
        use_conv=False,
        conv_kernel_size=3,
        zero_init_output=True,
    ):
        super().__init__()
        inner_dim = int(dim * mult)

        # Default to SwiGLU

        activation = nn.SiLU()

        dim_out = dim if dim_out is None else dim_out

        if glu:
            linear_in = GLU(dim, inner_dim, activation)
        else:
            linear_in = nn.Sequential(
                Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
                nn.Linear(dim, inner_dim, bias=not no_bias) if not use_conv else nn.Conv1d(
                    dim, inner_dim, conv_kernel_size, padding=(conv_kernel_size // 2), bias=not no_bias),
                Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
                activation
            )

        linear_out = nn.Linear(inner_dim, dim_out, bias=not no_bias) if not use_conv else nn.Conv1d(
            inner_dim, dim_out, conv_kernel_size, padding=(conv_kernel_size // 2), bias=not no_bias)

        # init last linear layer to 0
        if zero_init_output:
            nn.init.zeros_(linear_out.weight)
            if not no_bias:
                nn.init.zeros_(linear_out.bias)

        self.ff = nn.Sequential(
            linear_in,
            Rearrange('b d n -> b n d') if use_conv else nn.Identity(),
            linear_out,
            Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
        )

    def forward(self, x):
        return self.ff(x)


class Attention(nn.Module):
    def __init__(
        self,
        dim,
        dim_heads=64,
        dim_context=None,
        causal=False,
        zero_init_output=True,
        qk_norm: Literal['l2', 'ln', 'none'] = 'none',
        natten_kernel_size=None
    ):
        super().__init__()
        self.dim = dim
        self.dim_heads = dim_heads
        self.causal = causal

        dim_kv = dim_context if dim_context is not None else dim

        self.num_heads = dim // dim_heads
        self.kv_heads = dim_kv // dim_heads

        if dim_context is not None:
            self.to_q = nn.Linear(dim, dim, bias=False)
            self.to_kv = nn.Linear(dim_kv, dim_kv * 2, bias=False)
        else:
            self.to_qkv = nn.Linear(dim, dim * 3, bias=False)

        self.to_out = nn.Linear(dim, dim, bias=False)

        if zero_init_output:
            nn.init.zeros_(self.to_out.weight)

        self.qk_norm = qk_norm

        if self.qk_norm == "ln":
            self.q_norm = nn.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6)
            self.k_norm = nn.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6)

        # Using 1d neighborhood attention
        self.natten_kernel_size = natten_kernel_size
        if natten_kernel_size is not None:
            return

        self.use_pt_flash = torch.cuda.is_available() and version.parse(torch.__version__) >= version.parse('2.0.0')

        self.use_fa_flash = torch.cuda.is_available() and flash_attn_func is not None

        self.sdp_kwargs = dict(
            enable_flash=True,
            enable_math=True,
            enable_mem_efficient=True
        )

    def flash_attn(
        self,
        q,
        k,
        v,
        mask=None,
        causal=None
    ):
        batch, heads, q_len, _, k_len, device = *q.shape, k.shape[-2], q.device
        kv_heads = k.shape[1]
        # Recommended for multi-query single-key-value attention by Tri Dao
        # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64])

        if heads != kv_heads:
            # Repeat interleave kv_heads to match q_heads
            heads_per_kv_head = heads // kv_heads
            k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim=1), (k, v))

        if k.ndim == 3:
            k = rearrange(k, 'b ... -> b 1 ...').expand_as(q)

        if v.ndim == 3:
            v = rearrange(v, 'b ... -> b 1 ...').expand_as(q)

        causal = self.causal if causal is None else causal

        if q_len == 1 and causal:
            causal = False

        if mask is not None:
            assert mask.ndim == 4
            mask = mask.expand(batch, heads, q_len, k_len)

        # handle kv cache - this should be bypassable in updated flash attention 2

        if k_len > q_len and causal:
            causal_mask = self.create_causal_mask(q_len, k_len, device=device)
            if mask is None:
                mask = ~causal_mask
            else:
                mask = mask & ~causal_mask
            causal = False

        # manually handle causal mask, if another mask was given

        row_is_entirely_masked = None

        if mask is not None and causal:
            causal_mask = self.create_causal_mask(q_len, k_len, device=device)
            mask = mask & ~causal_mask

            # protect against an entire row being masked out

            row_is_entirely_masked = ~mask.any(dim=-1)
            mask[..., 0] = mask[..., 0] | row_is_entirely_masked

            causal = False

        with torch.backends.cuda.sdp_kernel(**self.sdp_kwargs):
            out = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=mask,
                is_causal=causal
            )

        # for a row that is entirely masked out, should zero out the output of that row token

        if row_is_entirely_masked is not None:
            out = out.masked_fill(row_is_entirely_masked[..., None], 0.)

        return out

    def forward(
        self,
        x,
        context=None,
        mask=None,
        context_mask=None,
        rotary_pos_emb=None,
        causal=None
    ):
        h, kv_h, has_context = self.num_heads, self.kv_heads, context is not None

        kv_input = context if has_context else x

        if hasattr(self, 'to_q'):
            # Use separate linear projections for q and k/v
            q = self.to_q(x)
            q = rearrange(q, 'b n (h d) -> b h n d', h=h)

            k, v = self.to_kv(kv_input).chunk(2, dim=-1)

            k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=kv_h), (k, v))
        else:
            # Use fused linear projection
            q, k, v = self.to_qkv(x).chunk(3, dim=-1)
            q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))

        # Normalize q and k for cosine sim attention
        if self.qk_norm == "l2":
            q = F.normalize(q, dim=-1)
            k = F.normalize(k, dim=-1)
        elif self.qk_norm == "ln":
            q = self.q_norm(q)
            k = self.k_norm(k)

        if rotary_pos_emb is not None and not has_context:
            freqs, _ = rotary_pos_emb

            q_dtype = q.dtype
            k_dtype = k.dtype

            q = q.to(torch.float32)
            k = k.to(torch.float32)
            freqs = freqs.to(torch.float32)

            q = apply_rotary_pos_emb(q, freqs)
            k = apply_rotary_pos_emb(k, freqs)

            q = q.to(q_dtype)
            k = k.to(k_dtype)

        input_mask = context_mask

        if input_mask is None and not has_context:
            input_mask = mask

        # determine masking
        masks = []
        final_attn_mask = None  # The mask that will be applied to the attention matrix, taking all masks into account

        if input_mask is not None:
            input_mask = rearrange(input_mask, 'b j -> b 1 1 j')
            masks.append(~input_mask)

        # Other masks will be added here later

        if len(masks) > 0:
            final_attn_mask = ~or_reduce(masks)

        n, device = q.shape[-2], q.device

        causal = self.causal if causal is None else causal

        if n == 1 and causal:
            causal = False

        if self.natten_kernel_size is not None:
            if natten is None:
                raise ImportError('natten not installed, please install natten to use neighborhood attention')

            dtype_in = q.dtype
            q, k, v = map(lambda t: t.to(torch.float32), (q, k, v))

            attn = natten.functional.natten1dqk(q, k, kernel_size=self.natten_kernel_size, dilation=1)

            if final_attn_mask is not None:
                attn = attn.masked_fill(final_attn_mask, -torch.finfo(attn.dtype).max)

            attn = F.softmax(attn, dim=-1, dtype=torch.float32)

            out = natten.functional.natten1dav(attn, v, kernel_size=self.natten_kernel_size, dilation=1).to(dtype_in)

        # Prioritize Flash Attention 2
        elif self.use_fa_flash:
            assert final_attn_mask is None, 'masking not yet supported for Flash Attention 2'
            # Flash Attention 2 requires FP16 inputs
            fa_dtype_in = q.dtype
            q, k, v = map(lambda t: rearrange(t, 'b h n d -> b n h d').to(torch.float16), (q, k, v))

            out = flash_attn_func(q, k, v, causal=causal)

            out = rearrange(out.to(fa_dtype_in), 'b n h d -> b h n d')

        # Fall back to PyTorch implementation
        elif self.use_pt_flash:
            out = self.flash_attn(q, k, v, causal=causal, mask=final_attn_mask)

        else:
            # Fall back to custom implementation

            if h != kv_h:
                # Repeat interleave kv_heads to match q_heads
                heads_per_kv_head = h // kv_h
                k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim=1), (k, v))

            scale = 1. / (q.shape[-1] ** 0.5)

            kv_einsum_eq = 'b j d' if k.ndim == 3 else 'b h j d'

            dots = einsum(f'b h i d, {kv_einsum_eq} -> b h i j', q, k) * scale

            i, j, dtype = *dots.shape[-2:], dots.dtype

            mask_value = -torch.finfo(dots.dtype).max

            if final_attn_mask is not None:
                dots = dots.masked_fill(~final_attn_mask, mask_value)

            if causal:
                causal_mask = self.create_causal_mask(i, j, device=device)
                dots = dots.masked_fill(causal_mask, mask_value)

            attn = F.softmax(dots, dim=-1, dtype=torch.float32)
            attn = attn.type(dtype)

            out = einsum(f'b h i j, {kv_einsum_eq} -> b h i d', attn, v)

        # merge heads
        out = rearrange(out, ' b h n d -> b n (h d)')
        out = self.to_out(out)

        if mask is not None:
            mask = rearrange(mask, 'b n -> b n 1')
            out = out.masked_fill(~mask, 0.)

        return out


class ConformerModule(nn.Module):
    def __init__(
        self,
        dim,
        norm_kwargs={},
    ):
        super().__init__()

        self.dim = dim

        self.in_norm = LayerNorm(dim, **norm_kwargs)
        self.pointwise_conv = nn.Conv1d(dim, dim, kernel_size=1, bias=False)
        self.glu = GLU(dim, dim, nn.SiLU())
        self.depthwise_conv = nn.Conv1d(dim, dim, kernel_size=17, groups=dim, padding=8, bias=False)
        self.mid_norm = LayerNorm(dim, **norm_kwargs)  # This is a batch norm in the original but I don't like batch norm
        self.swish = nn.SiLU()
        self.pointwise_conv_2 = nn.Conv1d(dim, dim, kernel_size=1, bias=False)

    def forward(self, x):
        x = self.in_norm(x)
        x = rearrange(x, 'b n d -> b d n')
        x = self.pointwise_conv(x)
        x = rearrange(x, 'b d n -> b n d')
        x = self.glu(x)
        x = rearrange(x, 'b n d -> b d n')
        x = self.depthwise_conv(x)
        x = rearrange(x, 'b d n -> b n d')
        x = self.mid_norm(x)
        x = self.swish(x)
        x = rearrange(x, 'b n d -> b d n')
        x = self.pointwise_conv_2(x)
        x = rearrange(x, 'b d n -> b n d')

        return x


class TransformerBlock(nn.Module):
    def __init__(
        self,
        dim,
        dim_heads=64,
        cross_attend=False,
        dim_context=None,
        global_cond_dim=None,
        causal=False,
        zero_init_branch_outputs=True,
        conformer=False,
        layer_ix=-1,
        remove_norms=False,
        attn_kwargs={},
        ff_kwargs={},
        norm_kwargs={}
    ):

        super().__init__()
        self.dim = dim
        self.dim_heads = dim_heads
        self.cross_attend = cross_attend
        self.dim_context = dim_context
        self.causal = causal

        self.pre_norm = LayerNorm(dim, **norm_kwargs) if not remove_norms else nn.Identity()

        self.self_attn = Attention(
            dim,
            dim_heads=dim_heads,
            causal=causal,
            zero_init_output=zero_init_branch_outputs,
            **attn_kwargs
        )

        if cross_attend:
            self.cross_attend_norm = LayerNorm(dim, **norm_kwargs) if not remove_norms else nn.Identity()
            self.cross_attn = Attention(
                dim,
                dim_heads=dim_heads,
                dim_context=dim_context,
                causal=causal,
                zero_init_output=zero_init_branch_outputs,
                **attn_kwargs
            )

        self.ff_norm = LayerNorm(dim, **norm_kwargs) if not remove_norms else nn.Identity()
        self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, **ff_kwargs)

        self.layer_ix = layer_ix

        self.conformer = ConformerModule(dim, norm_kwargs=norm_kwargs) if conformer else None

        self.global_cond_dim = global_cond_dim

        if global_cond_dim is not None:
            self.to_scale_shift_gate = nn.Sequential(
                nn.SiLU(),
                nn.Linear(global_cond_dim, dim * 6, bias=False)
            )

            nn.init.zeros_(self.to_scale_shift_gate[1].weight)
            # nn.init.zeros_(self.to_scale_shift_gate_self[1].bias)

    def forward(
        self,
        x,
        context=None,
        global_cond=None,
        mask=None,
        context_mask=None,
        rotary_pos_emb=None
    ):
        if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None:

            scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = self.to_scale_shift_gate(
                global_cond).unsqueeze(1).chunk(6, dim=-1)

            # self-attention with adaLN
            residual = x
            x = self.pre_norm(x)
            x = x * (1 + scale_self) + shift_self
            x = self.self_attn(x, mask=mask, rotary_pos_emb=rotary_pos_emb)
            x = x * torch.sigmoid(1 - gate_self)
            x = x + residual

            if context is not None:
                x = x + self.cross_attn(self.cross_attend_norm(x), context=context, context_mask=context_mask)

            if self.conformer is not None:
                x = x + self.conformer(x)

            # feedforward with adaLN
            residual = x
            x = self.ff_norm(x)
            x = x * (1 + scale_ff) + shift_ff
            x = self.ff(x)
            x = x * torch.sigmoid(1 - gate_ff)
            x = x + residual

        else:
            x = x + self.self_attn(self.pre_norm(x), mask=mask, rotary_pos_emb=rotary_pos_emb)

            if context is not None:
                x = x + self.cross_attn(self.cross_attend_norm(x), context=context, context_mask=context_mask)

            if self.conformer is not None:
                x = x + self.conformer(x)

            x = x + self.ff(self.ff_norm(x))

        return x


class ContinuousTransformer(nn.Module):
    def __init__(
        self,
        dim,
        depth,
        *,
        dim_in=None,
        dim_out=None,
        dim_heads=64,
        cross_attend=False,
        cond_token_dim=None,
        global_cond_dim=None,
        causal=False,
        rotary_pos_emb=True,
        zero_init_branch_outputs=True,
        conformer=False,
        use_sinusoidal_emb=False,
        use_abs_pos_emb=False,
        abs_pos_emb_max_length=10000,
        **kwargs
    ):

        super().__init__()

        self.dim = dim
        self.depth = depth
        self.causal = causal
        self.layers = nn.ModuleList([])

        self.project_in = nn.Linear(dim_in, dim, bias=False) if dim_in is not None else nn.Identity()
        self.project_out = nn.Linear(dim, dim_out, bias=False) if dim_out is not None else nn.Identity()

        if rotary_pos_emb:
            self.rotary_pos_emb = RotaryEmbedding(max(dim_heads // 2, 32))
        else:
            self.rotary_pos_emb = None

        self.use_sinusoidal_emb = use_sinusoidal_emb
        if use_sinusoidal_emb:
            self.pos_emb = ScaledSinusoidalEmbedding(dim)

        self.use_abs_pos_emb = use_abs_pos_emb
        if use_abs_pos_emb:
            self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length)

        for i in range(depth):
            self.layers.append(
                TransformerBlock(
                    dim,
                    dim_heads=dim_heads,
                    cross_attend=cross_attend,
                    dim_context=cond_token_dim,
                    global_cond_dim=global_cond_dim,
                    causal=causal,
                    zero_init_branch_outputs=zero_init_branch_outputs,
                    conformer=conformer,
                    layer_ix=i,
                    **kwargs
                )
            )

    def forward(
        self,
        x,
        mask=None,
        prepend_embeds=None,
        prepend_mask=None,
        global_cond=None,
        return_info=False,
        **kwargs
    ):
        batch, seq, device = *x.shape[:2], x.device

        info = {
            "hidden_states": [],
        }

        x = self.project_in(x)

        if prepend_embeds is not None:
            prepend_length, prepend_dim = prepend_embeds.shape[1:]

            assert prepend_dim == x.shape[-1], 'prepend dimension must match sequence dimension'

            x = torch.cat((prepend_embeds, x), dim=-2)

            if prepend_mask is not None or mask is not None:
                mask = mask if mask is not None else torch.ones((batch, seq), device=device, dtype=torch.bool)
                prepend_mask = prepend_mask if prepend_mask is not None else torch.ones(
                    (batch, prepend_length), device=device, dtype=torch.bool)

                mask = torch.cat((prepend_mask, mask), dim=-1)

        # Attention layers

        if self.rotary_pos_emb is not None:
            rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1])
        else:
            rotary_pos_emb = None

        if self.use_sinusoidal_emb or self.use_abs_pos_emb:
            x = x + self.pos_emb(x)

        # Iterate over the transformer layers
        for layer in self.layers:
            # x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
            x = checkpoint(layer, x, rotary_pos_emb=rotary_pos_emb, global_cond=global_cond, **kwargs)

            if return_info:
                info["hidden_states"].append(x)

        x = self.project_out(x)

        if return_info:
            return x, info

        return x
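ContinuousTransformer only becomes part of EAR_VAE when the model config contains a "transformer" block; it operates on the latent sequence in [batch, time, dim] layout, which is why ear_vae.py permutes before and after calling it. A standalone sketch with small, assumed dimensions (not the shipped configuration):

# Sketch: forward a latent sequence through a tiny ContinuousTransformer.
import torch
from model.transformer import ContinuousTransformer

tfm = ContinuousTransformer(dim=64, depth=2, dim_heads=32)
z = torch.randn(2, 43, 64)     # [B, T_latent, dim], e.g. ~1 s of 1024x-compressed latents
out = tfm(z)                   # shape-preserving: [2, 43, 64]
print(out.shape)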
pretrained_weight/ear_vae_44k.pyt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0362dc7e96566869747dbe079b0a6d71c090b0a3a5d5077779e7be17c096d9d5
size 591453838
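The .pyt entry above is a Git LFS pointer, so the ~590 MB state dict only exists locally after an LFS-aware clone or download. A hedged way to confirm the object resolved and to peek at its layout; the expectation that keys start with "encoder"/"decoder" comes from ear_vae.py's attribute names and is an assumption, not a documented guarantee.

# Sketch: check that the LFS object was fetched (a bare pointer file is only ~130 bytes).
import os
import torch

path = "./pretrained_weight/ear_vae_44k.pyt"
print(os.path.getsize(path))                        # expected 591453838 bytes
state = torch.load(path, map_location="cpu")
print(len(state), sorted({k.split('.')[0] for k in state}))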