import contextlib
import re
import tempfile
from functools import lru_cache
from typing import Optional

import gradio as gr
from git import Repo
from httpx import Client
from huggingface_hub import create_repo, upload_folder
from toolz import groupby
import kagglehub
from kagglehub import KaggleDatasetAdapter

client = Client()


def clone_into_temp_dir(github_repo_url):
    temp_dir = tempfile.TemporaryDirectory()
    # Clone into the temporary directory's path (not the TemporaryDirectory object)
    # and return the directory object so the caller can clean it up when done.
    return Repo.clone_from(github_repo_url, temp_dir.name), temp_dir


# repo = clone_into_temp_dir("https://github.com/chen-zichen/XplainLLM_dataset/")
# clone_into_temp_dir("https://github.com/chen-zichen/XplainLLM_dataset/")


def upload_directory_to_hf(
    repo_id: str,
    directory: str,
    oauth_token: str,
):
    private = False
    url = create_repo(
        repo_id,
        token=oauth_token,
        exist_ok=True,
        repo_type="dataset",
        private=private,
    )
    commit_url = upload_folder(
        repo_id=repo_id,
        folder_path=directory,
        path_in_repo="data",
        repo_type="dataset",
        token=oauth_token,
        commit_message="Migrated from GitHub",
        ignore_patterns=[
            "*.git*",
            # "*README.md*",
            "*.DS_Store",
            "*.env",
        ],  # ignore git files and .env files
    )
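# Illustrative call only (the repo id, path, and token below are placeholders):
# upload_directory_to_hf(
#     repo_id="my-username/my-migrated-dataset",
#     directory="/tmp/cloned-repo/data",
#     oauth_token="hf_xxx",
# )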


def push_to_hf(
    source_github_repository,
    destination_hf_hub_repository,
    subdirectory,
    oauth_token: gr.OAuthToken,
):
    gr.Info("Cloning source GitHub repository...")
    repo, temporary_directory = clone_into_temp_dir(source_github_repository)
    gr.Info("Cloning source GitHub repository...Done")
    gr.Info("Syncing with Hugging Face Hub...")
    # `subdirectory` comes from a multiselect dropdown, so it arrives as a list
    # (or None); only the first selected folder is uploaded.
    if subdirectory:
        src_directory = f"{repo.working_dir}/{subdirectory[0]}"
    else:
        src_directory = repo.working_dir
    upload_directory_to_hf(
        repo_id=destination_hf_hub_repository,
        directory=src_directory,
        oauth_token=oauth_token.token,
    )
    gr.Info("Syncing with Hugging Face Hub...Done")
    temporary_directory.cleanup()
    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"


def extract_user_name_and_repo_from_url(github_url: str):
    pattern = r"https://github.com/([^/]+)/([^/]+)"
    if match := re.search(pattern, github_url):
        return match[1], match[2]
    print("No match found in the GitHub URL.")
    return None
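# For example, "https://github.com/chen-zichen/XplainLLM_dataset/" yields
# ("chen-zichen", "XplainLLM_dataset").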


def get_files_and_directories(response):
    data = response.json()
    grouped_by_type = groupby(lambda item: item["type"], data["tree"])
    files = grouped_by_type.get("blob", [])
    directories = grouped_by_type.get("tree", [])
    if files:
        files = [file["path"] for file in files]
    if directories:
        directories = [directory["path"] for directory in directories]
    return {"files": files, "directories": directories}


def list_git_repo_files_and_directories(repo_url: str, branch: str = "main"):
    user_name_and_repo = extract_user_name_and_repo_from_url(repo_url)
    if user_name_and_repo is None:
        return None
    user_name, repo_name = user_name_and_repo
    url = f"https://api.github.com/repos/{user_name}/{repo_name}/git/trees/{branch}"
    response = client.get(url)
    if response.status_code == 200:
        return get_files_and_directories(response)
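# The GitHub Git trees API returns a JSON payload whose "tree" list contains items
# with a "type" field ("blob" for files, "tree" for directories) and a "path" field,
# which get_files_and_directories groups on. If the URL cannot be parsed or the
# request fails, None is returned implicitly.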


def show_files_and_directories(url: str):
    with contextlib.suppress(Exception):
        files_and_directories = list_git_repo_files_and_directories(url)
        directories = files_and_directories.get("directories", [])
        files = files_and_directories.get("files", [])
        print(directories)
        return gr.Dropdown(
            label="Directories",
            choices=directories,
            max_choices=1,
            visible=True,
            interactive=True,
            multiselect=True,
        ), gr.Dropdown(
            label="Files",
            choices=files,
            max_choices=None,
            visible=True,
            interactive=True,
            multiselect=True,
        )
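# Wired to `source_github_repository.change` in the UI below: whenever the URL textbox
# changes, the folder and file dropdowns are repopulated from the GitHub tree listing.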


def push_kaggle_to_hf(
    source_kaggle_dataset: str,
    destination_hf_hub_repository: str,
    file_path: str,
    oauth_token: gr.OAuthToken,
):
    """Pushes a Kaggle dataset to HuggingFace Hub using the HF dataset adapter"""
    if not file_path:
        raise ValueError("File path must be specified for Kaggle datasets")
    gr.Info("Loading Kaggle dataset...")
    dataset = kagglehub.load_dataset(
        KaggleDatasetAdapter.HUGGING_FACE,
        source_kaggle_dataset,
        file_path,
    )
    gr.Info("Loading Kaggle dataset...Done")
    gr.Info("Pushing to Hugging Face Hub...")
    dataset.push_to_hub(
        destination_hf_hub_repository,
        token=oauth_token.token,
    )
    gr.Info("Pushing to Hugging Face Hub...Done")
    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"


html_text_app_description = """
While GitHub and Kaggle are great platforms, the Hugging Face Datasets Hub is a better place to host and share datasets.
Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
<br>
<ul>
<li>Hosting for large datasets</li>
<li>An interactive preview of your dataset</li>
<li>Access to the dataset via many tools and libraries, including datasets, pandas, polars, dask, and DuckDB</li>
<li>Seamless integration with machine learning workflows</li>
<li>Version control and versioning for your dataset</li>
</ul>
<br>
This app will help you migrate datasets currently hosted on GitHub or Kaggle to the Hugging Face Datasets Hub.
Make sure you consider the license of the dataset when migrating it to the Hugging Face Datasets Hub 🤗.
<br>
<br>
<i>Note: the Kaggle implementation is experimental and may not work for all datasets. Feel free to open a PR to improve it!</i>
"""


with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.HTML(
        """<h1 style='text-align: center;'>Dataset Migration Tool</h1>
        <center><i>✨ Migrate datasets to Hugging Face Hub in a few steps ✨</i></center>"""
    )
    gr.HTML(html_text_app_description)
    with gr.Row():
        gr.LoginButton(size="sm")
    with gr.Tabs() as tabs:
        with gr.Tab("GitHub"):
            gr.Markdown("### Location of existing dataset")
            gr.Markdown(
                "URL for the GitHub repository where the dataset is currently hosted"
            )
            source_github_repository = gr.Textbox(
                lines=1, label="Source GitHub Repository URL"
            )
            with gr.Accordion("Advanced Options", open=False):
                gr.Markdown("### Select files and folder to migrate")
                gr.Markdown(
                    "(Optional) Select a specific folder and/or files to migrate from the GitHub repository. If you select a folder, all the files in that folder will be migrated."
                )
                folder_in_github_repo = gr.Dropdown(
                    None,
                    label="Folder in the GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
                files_in_github_repo = gr.Dropdown(
                    None,
                    label="Files in GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
                source_github_repository.change(
                    show_files_and_directories,
                    [source_github_repository],
                    [folder_in_github_repo, files_in_github_repo],
                )
            gr.Markdown("### Destination for your migrated dataset")
            destination_hf_hub_repository = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="e.g. <hugging face username>/<repository_name>",
            )
            github_submit_btn = gr.Button("Migrate GitHub Dataset")
            github_result = gr.Markdown(label="Summary", visible=True)
            github_submit_btn.click(
                push_to_hf,
                [
                    source_github_repository,
                    destination_hf_hub_repository,
                    folder_in_github_repo,
                ],
                [github_result],
            )
        with gr.Tab("Kaggle"):
            gr.Markdown("### Source Kaggle Dataset")
            gr.Markdown("Enter the Kaggle dataset name and file path")
            source_kaggle_dataset = gr.Textbox(
                lines=1,
                label="Source Kaggle Dataset",
                placeholder="username/dataset-name",
            )
            kaggle_file_path = gr.Textbox(
                label="File path in dataset",
                placeholder="e.g., train.csv",
                info="Specify the file to migrate from the dataset",
            )
            gr.Markdown("### Destination for your migrated dataset")
            kaggle_destination_hf_hub = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="e.g. <hugging face username>/<repository_name>",
            )
            kaggle_submit_btn = gr.Button("Migrate Kaggle Dataset")
            kaggle_result = gr.Markdown(label="Summary", visible=True)
            kaggle_submit_btn.click(
                push_kaggle_to_hf,
                [
                    source_kaggle_dataset,
                    kaggle_destination_hf_hub,
                    kaggle_file_path,
                ],
                [kaggle_result],
            )
    gr.Markdown(
        """You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card).
If you have any questions or feedback, feel free to reach out to us using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
    )

demo.launch()