Update app.py
Browse files
app.py
CHANGED
|
@@ -16,13 +16,13 @@ def generate_query(document):
|
|
| 16 |
input_ids = llm_tokenizer.encode(prompt, return_tensors="pt")
|
| 17 |
output = llm.generate(
|
| 18 |
input_ids,
|
| 19 |
-
|
| 20 |
num_return_sequences=5,
|
| 21 |
-
num_beams=5,
|
| 22 |
no_repeat_ngram_size=2,
|
| 23 |
early_stopping=True
|
| 24 |
)
|
| 25 |
-
queries = [llm_tokenizer.decode(seq, skip_special_tokens=True) for seq in output]
|
| 26 |
return queries
|
| 27 |
|
| 28 |
def rerank_pairs(queries, document):
|
|
@@ -46,12 +46,38 @@ def inpars_v2(document):
|
|
| 46 |
result = train_retriever([(best_query, document)])
|
| 47 |
return f"Generated query: {best_query}\n\n{result}"
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
iface = gr.Interface(
|
| 50 |
fn=inpars_v2,
|
| 51 |
inputs=gr.Textbox(lines=5, label="Input Document"),
|
| 52 |
outputs=gr.Textbox(label="Result"),
|
| 53 |
title="InPars-v2 Demo",
|
| 54 |
-
description=
|
|
|
|
| 55 |
)
|
| 56 |
|
| 57 |
iface.launch()
|
|
|
|
| 16 |
input_ids = llm_tokenizer.encode(prompt, return_tensors="pt")
|
| 17 |
output = llm.generate(
|
| 18 |
input_ids,
|
| 19 |
+
max_new_tokens=30,
|
| 20 |
num_return_sequences=5,
|
| 21 |
+
num_beams=5,
|
| 22 |
no_repeat_ngram_size=2,
|
| 23 |
early_stopping=True
|
| 24 |
)
|
| 25 |
+
queries = [llm_tokenizer.decode(seq[input_ids.shape[1]:], skip_special_tokens=True) for seq in output]
|
| 26 |
return queries
|
| 27 |
|
| 28 |
def rerank_pairs(queries, document):
|
|
|
|
| 46 |
result = train_retriever([(best_query, document)])
|
| 47 |
return f"Generated query: {best_query}\n\n{result}"
|
| 48 |
|
| 49 |
+
# Markdown description of the InPars-v2 paper
|
| 50 |
+
paper_description = """
|
| 51 |
+
# InPars-v2: Large Language Models as Efficient Dataset Generators for Information Retrieval
|
| 52 |
+
|
| 53 |
+
**Abstract Link:** [https://arxiv.org/abs/2301.01820](https://arxiv.org/abs/2301.01820)
|
| 54 |
+
**PDF Link:** [https://arxiv.org/pdf/2301.01820](https://arxiv.org/pdf/2301.01820)
|
| 55 |
+
|
| 56 |
+
**Authors:** Vitor Jeronymo, Luiz Bonifacio, Hugo Abonizio, Marzieh Fadaee, Roberto Lotufo, Jakub Zavrel, Rodrigo Nogueira
|
| 57 |
+
|
| 58 |
+
**Publication Date:** 26 May 2023
|
| 59 |
+
|
| 60 |
+
## Abstract
|
| 61 |
+
|
| 62 |
+
Recently, InPars introduced a method to efficiently use large language models (LLMs) in information retrieval tasks: via few-shot examples, an LLM is induced to generate relevant queries for documents. These synthetic query-document pairs can then be used to train a retriever. However, InPars and, more recently, Promptagator, rely on proprietary LLMs such as GPT-3 and FLAN to generate such datasets. In this work we introduce InPars-v2, a dataset generator that uses open-source LLMs and existing powerful rerankers to select synthetic query-document pairs for training. A simple BM25 retrieval pipeline followed by a monoT5 reranker finetuned on InPars-v2 data achieves new state-of-the-art results on the BEIR benchmark. To allow researchers to further improve our method, we open source the code, synthetic data, and finetuned models: [https://github.com/zetaalphavector/inPars/tree/master/tpu](https://github.com/zetaalphavector/inPars/tree/master/tpu)
|
| 63 |
+
|
| 64 |
+
## Key Features of InPars-v2
|
| 65 |
+
|
| 66 |
+
1. Uses open-source LLMs for query generation
|
| 67 |
+
2. Employs powerful rerankers to select high-quality synthetic query-document pairs
|
| 68 |
+
3. Achieves state-of-the-art results on the BEIR benchmark
|
| 69 |
+
4. Provides open-source code, synthetic data, and finetuned models
|
| 70 |
+
|
| 71 |
+
This demo provides a simplified implementation of the InPars-v2 concept, showcasing query generation, reranking, and retriever training.
|
| 72 |
+
"""
|
| 73 |
+
|
| 74 |
iface = gr.Interface(
|
| 75 |
fn=inpars_v2,
|
| 76 |
inputs=gr.Textbox(lines=5, label="Input Document"),
|
| 77 |
outputs=gr.Textbox(label="Result"),
|
| 78 |
title="InPars-v2 Demo",
|
| 79 |
+
description=paper_description,
|
| 80 |
+
article="This is a minimal implementation of the InPars-v2 concept. For the full implementation and more details, please refer to the original paper and GitHub repository."
|
| 81 |
)
|
| 82 |
|
| 83 |
iface.launch()
|