Spaces:
Sleeping
Sleeping
commit 2
Browse files
- download.py +25 -0
- download_corpus.sh +5 -0
- install_env.sh +6 -5
download.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
from huggingface_hub import hf_hub_download
|
| 3 |
+
|
| 4 |
+
# Split pieces of the index file, fetched from the repository given by
# --repo_id (download_corpus.sh joins them into e5_Flat.index afterwards).
INDEX_PARTS = ("part_aa", "part_ab")

# The corpus archive lives in its own, fixed dataset repository.
CORPUS_REPO_ID = "PeterJinGo/wiki-18-corpus"
CORPUS_FILENAME = "wiki-18.jsonl.gz"


def build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser for this script."""
    parser = argparse.ArgumentParser(description="Download files from a Hugging Face dataset repository.")
    parser.add_argument("--repo_id", type=str, default="PeterJinGo/wiki-18-e5-index", help="Hugging Face repository ID")
    parser.add_argument("--save_path", type=str, required=True, help="Local directory to save files")
    return parser


def download_plan(index_repo_id: str) -> list[tuple[str, str]]:
    """Return the ``(repo_id, filename)`` pairs to fetch, in download order.

    Args:
        index_repo_id: Dataset repository that hosts the index parts.

    Returns:
        One pair per index part (from ``index_repo_id``), followed by the
        corpus archive from its fixed repository.
    """
    plan = [(index_repo_id, part) for part in INDEX_PARTS]
    plan.append((CORPUS_REPO_ID, CORPUS_FILENAME))
    return plan


def main(argv=None) -> None:
    """Download the index parts and the corpus archive into ``--save_path``.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # Use the repository given on the command line. Previously --repo_id was
    # accepted but a hard-coded value was always used instead; the default is
    # that same value, so existing invocations behave as before.
    for repo_id, filename in download_plan(args.repo_id):
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            local_dir=args.save_path,
        )


# Only download when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
download_corpus.sh
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Download the index parts and the corpus archive into $save_path, then
# assemble the index file and unpack the corpus.
#
# Stop at the first failing command: without this, a failed rename of
# part_aa would still let the append below create an e5_Flat.index that
# contains only part_ab, and then delete part_ab.
set -e

save_path=./data

# Exit statuses must be in 0-255; "exit -1" is rejected by some shells.
python download.py --save_path "$save_path" || exit 1

# part_aa and part_ab are consecutive pieces of one file: rename the first
# piece to the final name, append the second piece, then drop the leftover.
mv "$save_path/part_aa" "$save_path/e5_Flat.index"
cat "$save_path/part_ab" >> "$save_path/e5_Flat.index"
rm "$save_path/part_ab"

# Decompress the corpus archive in place.
gzip -d "$save_path/wiki-18.jsonl.gz"
|
install_env.sh
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
-
|
| 2 |
-
conda
|
|
|
|
| 3 |
|
| 4 |
-
conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.1 -c pytorch -c nvidia
|
| 5 |
pip install transformers datasets pyserini
|
| 6 |
|
| 7 |
-
conda install -c pytorch -c nvidia faiss-gpu=1.8.0
|
| 8 |
|
| 9 |
-
pip install uvicorn fastapi
|
|
|
|
| 1 |
+
# Before running this script, execute these two commands by hand:
#   conda create -n faiss_env python=3.10
#   conda activate faiss_env

# PyTorch, torchvision and torchaudio at pinned versions, with pytorch-cuda 12.1.
conda install -y -c pytorch -c nvidia \
    pytorch==2.4.0 \
    torchvision==0.19.0 \
    torchaudio==2.4.0 \
    pytorch-cuda=12.1

pip install \
    transformers \
    datasets \
    pyserini

# GPU build of faiss, pinned to 1.8.0.
conda install -y -c pytorch -c nvidia faiss-gpu=1.8.0

pip install \
    uvicorn \
    fastapi \
    huggingface_hub
|