# Evaluation of a QA System

In [None]:
# Option A (disabled): install the latest published release of Haystack.
# ! pip install farm-haystack

# Option B (active): install Haystack from the head of its GitHub repository.
# grpcio-tools and grpcio are pinned to 1.32.0 before Haystack is installed.
# NOTE(review): the reason for the grpcio pin is not recorded here - presumably
# a dependency conflict in the hosting environment; confirm it is still needed.
# NOTE(review): installing from an unpinned branch makes reruns non-reproducible;
# consider pinning a tag or commit.
!pip install grpcio-tools==1.32.0
!pip install grpcio==1.32.0
!pip install git+https://github.com/deepset-ai/haystack.git
 

In [None]:
# Download and unpack Elasticsearch 7.9.2, then hand the directory to the
# 'daemon' user so the server can be started under that account.
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
# Launch Elasticsearch as a background child process; stderr is merged into stdout.
# NOTE(review): stdout is a PIPE that nothing in this notebook reads; a child
# writing to an unread pipe can block once the OS buffer fills - confirm this
# is acceptable for long runs.
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
 stdout=PIPE, stderr=STDOUT,
 preexec_fn=lambda: os.setuid(1) # switch to uid 1 before exec (expected to be the 'daemon' user)
 )
# Fixed 30-second wait to give Elasticsearch time to start.
# NOTE(review): this does not check that the server is actually ready.
! sleep 30

In [None]:
from haystack.modeling.utils import initialize_device_settings

# Request CUDA; `device` is later passed to reader.eval_on_file.
# `n_gpu` is not used anywhere else in the visible notebook.
device, n_gpu = initialize_device_settings(use_cuda=True)

In [None]:
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

# Connect to the Elasticsearch instance on localhost (no credentials) and
# create the index if needed. Embeddings are stored in the "emb" field with
# dimension 768, and "emb" is listed in excluded_meta_data.
# NOTE(review): embedding_dim=768 presumably matches the DPR encoders loaded
# later in this notebook - confirm. The unit of timeout=300000 is whatever the
# document store expects (presumably seconds) - confirm.
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
 create_index=True, embedding_field="emb",
 embedding_dim=768, excluded_meta_data=["emb"], timeout=300000)

In [None]:
from haystack.preprocessor.utils import convert_files_to_dicts

In [None]:
from haystack.preprocessor.utils import convert_files_to_dicts
# Convert the files under the Arabic wiki input directory into document dicts,
# splitting them into paragraphs.
dicts = convert_files_to_dicts('../input/arabicwikinew/arabic-wiki',split_paragraphs=True)
# Write the documents to the store in batches of 10000; duplicates are handled
# with the 'skip' policy.
document_store.write_documents(dicts,batch_size = 10000,duplicate_documents ='skip')

In [None]:
# Display the number of documents in the store (no index argument is passed,
# so the store's default index is counted).
document_store.get_document_count()

In [None]:
# Sample query straight against the document store (BM25 retrieval), asking in
# Arabic "Where is Palestine located?" and returning the top 2 hits.
document_store.query('أين تقع فلسطين ؟',top_k = 2)

In [None]:
import pandas as pd
# Peek at the first entry under the 'data' key of the TyDi QA Arabic dev file
# to inspect its structure.
pd.read_json('../input/tydiaqatestset/tydiqa-goldp-dev-arabic.json')['data'][0]

In [None]:
from haystack.reader.farm import FARMReader

# Extractive reader backed by the "wissamantoun/araelectra-base-artydiqa" model.
# It is configured with top_k=3 and with "no answer" predictions disabled.
reader = FARMReader("wissamantoun/araelectra-base-artydiqa", top_k=3, return_no_answer=False)

In [None]:
from haystack.eval import EvalAnswers, EvalDocuments

# Here we initialize the nodes that perform evaluation.
# eval_retriever is configured with top_k=10 and debug output enabled.
eval_retriever = EvalDocuments(debug=True,top_k=10)
# eval_reader uses a multilingual sentence-transformers model as its sas_model,
# runs with open_domain=True, and has debug output enabled.
eval_reader = EvalAnswers(sas_model="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",open_domain=True,debug=True)

In [None]:
# Names of the Elasticsearch indices that hold the evaluation documents and
# the evaluation labels; both are passed to add_eval_data in a later cell.
doc_index = "eval_docs"
label_index = "eval_labels"

In [None]:
from haystack.preprocessor import PreProcessor

# Add evaluation data to the Elasticsearch document store.
# The documents are split into shorter passages by the PreProcessor configured
# here: split_length=500, no overlap, sentence boundaries not respected, and
# both cleaning options switched off.
preprocessor = PreProcessor(
 split_length=500,
 split_overlap=0,
 split_respect_sentence_boundary=False,
 clean_empty_lines=False,
 clean_whitespace=False
)
# NOTE(review): the two delete calls below are disabled, so the evaluation
# indices are NOT cleared before loading. Re-running this cell may insert the
# same data again - TODO confirm whether add_eval_data de-duplicates.
# document_store.delete_all_documents(index=doc_index)
# document_store.delete_all_documents(index=label_index)
document_store.add_eval_data(
 filename="../input/tydiaqatestset/tydiqa-goldp-dev-arabic.json",
 doc_index=doc_index,
 label_index=label_index,
 preprocessor=preprocessor
)

# Fetch the aggregated labels from the label index; they drive the evaluation
# of the retriever and the reader.
labels = document_store.get_all_labels_aggregated(index=label_index)


In [None]:
# Display how many aggregated labels are currently held in `labels`.
len(labels)

In [None]:
# Print the document counts of the 'eval_docs' index and the 'document' index.
print(document_store.get_document_count(index = 'eval_docs'),document_store.get_document_count(index = 'document'))

## Initialize components of QA-System

In [None]:
# Initialize the dense passage retriever from the locally stored query and
# passage encoders. It runs on GPU, embeds titles, and uses batch_size=4.
from haystack.retriever.dense import DensePassageRetriever
retriever = DensePassageRetriever(document_store=document_store,
 query_embedding_model="../input/dprarcdtydiqa/saved_models_DPR_tydiqa+arcd/query_encoder",
 passage_embedding_model="../input/dprarcdtydiqa/saved_models_DPR_tydiqa+arcd/passage_encoder",
 use_gpu=True,
 embed_title=True,
 batch_size=4)
# Compute embeddings with this retriever in batches of 10000;
# update_existing_embeddings=False is passed, so documents that already have
# an embedding are presumably left untouched. No index is given here.
document_store.update_embeddings(retriever=retriever,update_existing_embeddings=False,batch_size=10000)

In [None]:
# Print the document counts of the 'eval_docs' and 'document' indices again,
# as a sanity check after the embedding update.
print(document_store.get_document_count(index = 'eval_docs'),document_store.get_document_count(index = 'document'))

# Get examples of what the retriever retrieves

In [None]:
# Persist the retriever under the relative path 'retriever'.
retriever.save('retriever')

In [None]:
# Ask the dense retriever for its top 2 results for a sample Arabic question
# ("Where is Palestine located").
retriever.run_query('أين تقع فلسطين',top_k = 2)

## Evaluation of Retriever
Here we evaluate only the retriever, based on whether the gold_label document is retrieved.

In [None]:
# Compute embeddings for the documents in the evaluation index (doc_index)
# with the same retriever, so the dense retriever can be evaluated on them.
document_store.update_embeddings(retriever, index=doc_index)

In [None]:
# ## Evaluate Retriever on its own
# retriever_eval_results = retriever.eval(top_k=20,open_domain=True,return_preds=True)
# ## Retriever Recall is the proportion of questions for which the correct document containing the answer is
# ## among the retrieved documents
# print("Retriever Recall:", retriever_eval_results['metrics']['recall'])
# ## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
# print("Retriever Mean Avg Precision:", retriever_eval_results['metrics']["map"])


In [None]:
def get_list_of_passages(doc, top_k):
    """Return the text content of the first ``top_k`` retrieved documents.

    Args:
        doc: Sequence of retrieved documents. Each item must offer a
            ``to_dict()`` method whose result contains a ``'content'`` key.
        top_k: Maximum number of leading documents to extract. Values below
            one yield an empty list.

    Returns:
        A list with the ``'content'`` value of each of the first ``top_k``
        documents, in their original order. If fewer than ``top_k`` documents
        are available, all of them are returned instead of raising
        ``IndexError``.
    """
    # Slice instead of indexing 0..top_k-1 so short result lists are tolerated;
    # clamp at zero so a negative top_k does not become a "drop the tail" slice.
    return [passage.to_dict()['content'] for passage in doc[:max(top_k, 0)]]

In [None]:
import json
def accuracy_retriever(retriever, dataset, top_k=20):
    """Measure how often a retrieved passage contains the gold answer text.

    The dataset file must be JSON with a top-level ``'data'`` list whose
    items follow the ``paragraphs -> qas -> answers`` nesting, where each
    question dict has a ``'question'`` string and each answer dict has a
    ``'text'`` string. An answer counts as found when its text is a substring
    of at least one of the retrieved passages for its question.

    Progress is printed after every answer and a summary is printed at the end.

    Args:
        retriever: Object with a ``retrieve(question, top_k=...)`` method that
            returns a list of documents; each document must provide
            ``to_dict()`` with a ``'content'`` key.
        dataset: Path to the JSON dataset file.
        top_k: Number of passages to request and inspect per question.

    Returns:
        A ``(found_answers, total_answers)`` tuple.
    """
    # JSON is UTF-8 by specification; being explicit keeps non-ASCII questions
    # and answers independent of the platform's default encoding.
    with open(dataset, encoding="utf-8") as f:
        articles = json.load(f)['data']

    found_answers = 0
    total_answers = 0
    for article in articles:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                if not qa['answers']:
                    # Nothing to score, so skip the (expensive) retrieval.
                    continue
                # Retrieval depends only on the question, so do it once per
                # question instead of once per answer.
                docs = retriever.retrieve(qa['question'], top_k=top_k)
                # Extracted inline (and by slicing) so a result list shorter
                # than top_k is handled without an IndexError.
                passages = [d.to_dict()['content'] for d in docs[:max(top_k, 0)]]
                for answer in qa['answers']:
                    if any(answer['text'] in passage for passage in passages):
                        found_answers += 1
                    total_answers += 1
                    print("Found answers so far: " + str(found_answers))
                    print("Total answers so far: " + str(total_answers))
    print("####################################################")
    print("DONE")
    print("####################################################")
    print("Found answers: " + str(found_answers))
    # Guard against an empty dataset, which would otherwise divide by zero.
    accuracy = found_answers / total_answers if total_answers else 0.0
    print("Accuracy is: " + str(accuracy))
    return found_answers, total_answers

In [None]:
# Score the dense retriever on the TyDi QA Arabic dev file; the cell displays
# the returned (found_answers, total_answers) tuple.
accuracy_retriever(retriever,'../input/tydiaqatestset/tydiqa-goldp-dev-arabic.json')

In [None]:
# retriever_eval_results['predictions'][0]

In [None]:
# Evaluate the reader on its own, directly from the SQuAD-formatted file.
# Disabled alternative that evaluates against the data stored in Elasticsearch:
# reader_eval_results = reader.eval(document_store=document_store, device=device, label_index=label_index, doc_index=doc_index)
# Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
reader_eval_results = reader.eval_on_file("../input/tydiaqatestset/","tydiqa-goldp-dev-arabic.json", device=device)

## Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer
print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])
## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer
print("Reader Exact Match:", reader_eval_results["EM"])
## Reader F1-Score is the average overlap between the predicted answers and the correct answers
print("Reader F1-Score:", reader_eval_results["f1"])

In [None]:
from haystack import Pipeline

# Evaluation pipeline: Query -> retriever -> eval_retriever -> reader -> eval_reader.
# NOTE: the node named "ESRetriever" wraps the DensePassageRetriever created
# earlier, not an Elasticsearch/BM25 retriever. Per-node run parameters must be
# addressed by these node names ("ESRetriever", "QAReader").
p = Pipeline()
p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
p.add_node(component=eval_retriever, name="EvalRetriever", inputs=["ESRetriever"])
p.add_node(component=reader, name="QAReader", inputs=["EvalRetriever"])
p.add_node(component=eval_reader, name="EvalReader", inputs=["QAReader"])
# Collects the raw output of every pipeline run in the evaluation loop.
results = []

In [None]:
# Plain question-answering pipeline without the evaluation nodes:
# Query -> retriever -> reader. Its only visible use is in a commented-out cell.
query_pipeline = Pipeline()
query_pipeline.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
query_pipeline.add_node(component=reader, name="QAReader",inputs=["ESRetriever"])

In [None]:
#query = 'ما هي عاصمة فلسطين ؟'
#res = query_pipeline.run(query=query)
#for ans in res['answers']:
 # print(ans['answer'])

In [None]:
# from haystack.pipeline import ExtractiveQAPipeline
# from haystack.utils import print_answers
# pipe = ExtractiveQAPipeline(reader, retriever)

# # Voilà! Ask a question!
# question = "أين تقع فلسطين ؟"
# prediction = pipe.run(query=question)
# print_answers(prediction)

In [None]:
# Keep only the first 900 aggregated labels for the pipeline evaluation.
# NOTE(review): presumably done to bound the evaluation run time - confirm.
labels = labels[0:900]

In [None]:
# Run every aggregated label through the evaluation pipeline `p`, collecting
# the raw outputs in `results` and printing a running counter as progress.
#
# Per-node parameters must be keyed by the names the nodes were registered
# under in add_node(), i.e. "ESRetriever" and "QAReader". The former
# "Retriever"/"Reader" keys matched no node, so the requested top_k values
# never reached the retriever or the reader (depending on the Haystack version
# such keys are silently ignored or rejected). "index" is a pipeline-wide
# parameter and selects the evaluation document index.
count = 0
for count, l in enumerate(labels, start=1):
    res = p.run(
        query=l.query,
        labels=l,
        params={"index": doc_index, "ESRetriever": {"top_k": 20}, "QAReader": {"top_k": 3}},
    )
    results.append(res)
    print(count)

In [None]:
# Print the evaluation report gathered during the pipeline runs: retriever
# metrics and timing first, then reader metrics and timing, and finally the
# eval_reader report in "pipeline" mode. Bare print() calls add blank lines
# between the sections.
# n_queries = len(labels)
eval_retriever.print()
print()
retriever.print_time()
print()
eval_reader.print(mode="reader")
print()
reader.print_time()
print()
eval_reader.print(mode="pipeline")