In [1]:
import pandas as pd
import numpy as np
import openai
import pickle
import os
import tqdm as tqdm
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from dotenv import load_dotenv
load_dotenv()
openai.api_key = os.getenv("OPENAI_KEY")

In [2]:
np.random.seed(42)
random_state=42

## Setting Up Functions

In [3]:
def return_criteria(prompt, show_prompt=False):
    if show_prompt:
        print(prompt)

    memory = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model='gpt-4-1106-preview',
        messages=memory,
        temperature=0,
        top_p=1,
    )
    return response['choices'][0]['message']['content'].strip('.').lower().capitalize()

In [4]:
def format_decision(x):
    return ("Included" if ("included" in x.lower()) else "Excluded" if ("excluded" in x.lower()) else "Undecided")

In [5]:
def loop_through_df(dataset_name, instructions, inclusion_criteria, exclusion_criteria, save_ckpt=True, sample=False, sample_n=3):
    if sample:
        temp_df = pd.read_excel(dataset_name)[:sample_n]
    else:
        temp_df = pd.read_excel(dataset_name)

    reasoning, decisions = [], []
    for ind, row in temp_df.iterrows():
        if (ind % 100 == 0) and ind != 0:
            print(ind)
        try:
            prompt = f"{instructions}\n\nTitle: {row['Title']}\n\nAbstract:\n{row['Abstract']}\n\nInclusion criteria:\n{inclusion_criteria}\n\nExclusion criteria:\n{exclusion_criteria}\n\nDecision:",
            temp_decision = return_criteria(prompt[0], show_prompt=False)
            reasoning.append(temp_decision)
            decisions.append(format_decision(temp_decision))
            if save_ckpt:
                with open(f'checkpoints/reasoning_ckpt.pkl', 'wb') as f:
                    pickle.dump(reasoning, f)
                with open(f'checkpoints/decisions_ckpt.pkl', 'wb') as f:
                    pickle.dump(decisions, f)
        except Exception as e:
            print(f'Error processing row {ind}: {row}\n\n{e}')
            reasoning.append(f'Error in row {ind}')
            decisions.append(f'Error in row {ind}')
            continue

    return reasoning, decisions

In [6]:
instructions = \
"""Instructions:
You are a researcher rigorously screening titles and abstracts of scientific papers for inclusion or exclusion in a review paper. Use the criteria below to inform your decision. Write "included" or "excluded" to indicate your decision. Explain your reasoning step-by-step. Format your answer as such:

Reasoning: {reasoning}
Decision: {included/excluded}"""

In [7]:
def print_array(arr):
    for i in arr:
        print(i)
        print('='*50)

## [CAUTION: CALLING API] Running Sampled Analysis

In [8]:
dataset_name = 'SearchResult-Risa.xlsx'

In [9]:
df = pd.read_excel(dataset_name)
df.shape

(1963, 5)

In [10]:
df.head()

Unnamed: 0,Type,Authors,Year,Title,Abstract
0,Journal Article,Y. Zhang; L. Peng; L. Zhang,2023.0,Research Progress on the Predicting Factors an...,Esophageal cancer is one of the malignant tumo...
1,Journal Article,N. Yoshida; K. Eto; T. Matsumoto; K. Kosumi; Y...,2023.0,Omental Flap Wrapping Around the Esophagogastr...,BACKGROUND: Anastomotic leakage after esophage...
2,Journal Article,Y. Yang; H. Li; X. Chen; J. Qin; Y. Li; Y. She...,2023.0,Comparison of neoadjuvant nab-paclitaxel plus ...,BACKGROUND: This study aimed to compare the fe...
3,Journal Article,K. Yamashita; M. Yamasaki; T. Makino; K. Tanak...,2023.0,Preoperative Comprehensive Geriatric Assessmen...,BACKGROUND: Preoperative risk assessment is im...
4,Journal Article,S.-J. Xu; P.-L. Wang; C. Chen; C.-X. You; R.-Q...,2023.0,Inflammatory and Nutritional Status Influences...,INTRODUCTION: The potential association betwee...


In [11]:
inclusion_criteria = """1. Studies focusing on patients who underwent esophagectomy for cancer. 
2. Studies reporting on perioperative risk factors associated with anastomotic complications (e.g., anastomotic leaks, strictures) following esophagectomy.
3. Studies that assess risk factors for other complications as well (ex. aspiration pneumonia, delayed gastric emptying, malnutrition, dilation, Mendelson syndrome) and prolonged length of stay or hospitalization.
4. Randomized controlled trials (RCTs), cohort studies, case-control studies, or prospective observational studies.
5. Articles published in English language.
6. Studies with adult participants (age â‰¥18 years)."""

exclusion_criteria = """1. Studies not reporting specific perioperative risk factors for anastomotic complications.
2. Studies that do not assess risk factors for other complications as well (ex. aspiration pneumonia, delayed gastric emptying, malnutrition, dilation, Mendelson syndrome) and prolonged length of stay or hospitalization.
3. Case reports, reviews, letters, editorials, and conference abstracts.
4. Studies focusing solely on pediatric populations.
5. Studies with insufficient data or incomplete reporting.
6. Studies not relevant to the investigation of anastomotic complications after esophagectomy.
7. Studies published before a specified date (if applicable).
8. Non-human studies or studies conducted on cadavers.
9. Duplicate publications or multiple reports from the same study."""

In [12]:
sampled_reasoning, sampled_decisions = loop_through_df(dataset_name, instructions,
                                                       inclusion_criteria, exclusion_criteria,
                                                       save_ckpt=False, sample=True, sample_n=3)

In [13]:
print_array(sampled_decisions)

Excluded
Included
Included


## Running On Entire Dataset

In [14]:
import time

In [16]:
# CAUTION: Expensive and time-consuming call to the GPT API
start = time.time()
reasoning_final, decisions_final = loop_through_df(dataset_name, instructions, inclusion_criteria, exclusion_criteria,
                                                   save_ckpt=True, sample=False)
print(f'Time elapsed: {time.time() - start}')

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
Time elapsed: 21130.468897104263


In [18]:
with open('results/reasoning_full.txt', 'w') as f:
    for item in reasoning_final:
        f.write("%s\n" % item)

with open('results/decisions_full.txt', 'w') as f:
    for item in decisions_final:
        f.write("%s\n" % item)

## Saving

In [20]:
df_clean = pd.read_excel(dataset_name)
df_clean.head()

Unnamed: 0,Type,Authors,Year,Title,Abstract
0,Journal Article,Y. Zhang; L. Peng; L. Zhang,2023.0,Research Progress on the Predicting Factors an...,Esophageal cancer is one of the malignant tumo...
1,Journal Article,N. Yoshida; K. Eto; T. Matsumoto; K. Kosumi; Y...,2023.0,Omental Flap Wrapping Around the Esophagogastr...,BACKGROUND: Anastomotic leakage after esophage...
2,Journal Article,Y. Yang; H. Li; X. Chen; J. Qin; Y. Li; Y. She...,2023.0,Comparison of neoadjuvant nab-paclitaxel plus ...,BACKGROUND: This study aimed to compare the fe...
3,Journal Article,K. Yamashita; M. Yamasaki; T. Makino; K. Tanak...,2023.0,Preoperative Comprehensive Geriatric Assessmen...,BACKGROUND: Preoperative risk assessment is im...
4,Journal Article,S.-J. Xu; P.-L. Wang; C. Chen; C.-X. You; R.-Q...,2023.0,Inflammatory and Nutritional Status Influences...,INTRODUCTION: The potential association betwee...


In [21]:
df_clean['Reasoning'] = reasoning_final
df_clean['Decision'] = decisions_final
df_clean.head()

Unnamed: 0,Type,Authors,Year,Title,Abstract,Reasoning,Decision
0,Journal Article,Y. Zhang; L. Peng; L. Zhang,2023.0,Research Progress on the Predicting Factors an...,Esophageal cancer is one of the malignant tumo...,Reasoning: the abstract indicates that the pap...,Excluded
1,Journal Article,N. Yoshida; K. Eto; T. Matsumoto; K. Kosumi; Y...,2023.0,Omental Flap Wrapping Around the Esophagogastr...,BACKGROUND: Anastomotic leakage after esophage...,Reasoning: the paper's title and abstract indi...,Included
2,Journal Article,Y. Yang; H. Li; X. Chen; J. Qin; Y. Li; Y. She...,2023.0,Comparison of neoadjuvant nab-paclitaxel plus ...,BACKGROUND: This study aimed to compare the fe...,Reasoning: the study focuses on patients who u...,Included
3,Journal Article,K. Yamashita; M. Yamasaki; T. Makino; K. Tanak...,2023.0,Preoperative Comprehensive Geriatric Assessmen...,BACKGROUND: Preoperative risk assessment is im...,Reasoning: the paper's title and abstract indi...,Included
4,Journal Article,S.-J. Xu; P.-L. Wang; C. Chen; C.-X. You; R.-Q...,2023.0,Inflammatory and Nutritional Status Influences...,INTRODUCTION: The potential association betwee...,Reasoning: the study focuses on patients who u...,Included


In [22]:
df_clean.to_csv('results/results_full.csv', index=False)

In [23]:
df_clean.Decision.value_counts()

Excluded    1154
Included     809
Name: Decision, dtype: int64

In [24]:
df_final = pd.read_csv('results/results_full.csv')
df_final.Decision.value_counts()

Excluded    1154
Included     809
Name: Decision, dtype: int64