# Abstract Dataset

In [91]:
import pandas as pd

# Load the abstracts dataset from a CSV in the working directory.
# NOTE: `file_path` and `data` are both rebound later, when the detection
# results are loaded from 'results.csv', so the summary cell that follows
# must run before that reload.
file_path = 'abstracts.csv'
data = pd.read_csv(file_path)

In [92]:
# Overview of the abstract dataset per (discipline, journal): the distinct
# author countries under each English-status label and the article count.
is_native = data['Status'] == 'Native'
is_non_native = data['Status'] == 'Non-Native'

summary_table = (
    data.groupby(['Category', 'Journal'])
    .agg(
        # The masks span the full frame; pandas aligns them to each group's index.
        Native_English=('Country', lambda countries: ', '.join(countries[is_native].unique())),
        Non_Native_English=('Country', lambda countries: ', '.join(countries[is_non_native].unique())),
        Total_Articles=('Journal', 'size'),
    )
    .reset_index()
    # Presentation-friendly column headers
    .rename(columns={
        'Category': 'Discipline',
        'Native_English': 'Native English',
        'Non_Native_English': 'Non-Native English',
        'Total_Articles': 'Total Articles (n)',
    })
)

summary_table


Unnamed: 0,Discipline,Journal,Native English,Non-Native English,Total Articles (n)
0,Interdisciplinary,British Journal of Educational Technology,"USA, UK, Australia, Canada","Taiwan, Iran, China, Turkey",8
1,Interdisciplinary,Computers & Education,"USA, UK","South Korea, Brazil, China",8
2,Interdisciplinary,Education and Information Technologies,"USA, UK, Australia","Indonesia, Iraq, Saudi Arabia, Tunisia",8
3,Social Sciences,International Sociology,"UK, USA, Canada","China, Argentina, Guatemala, South Korea",8
4,Social Sciences,SAGE Open,"Australia, Canada, New Zealand, USA","China, Oman, Thailand, Japan",8
5,Social Sciences,Sociology,"Australia, UK, USA","Hungary, Turkey, China",8
6,Technology & Engineering,ACM Computing Surveys,"Australia, USA, UK","Peru, Jordan, Mexico, Chile",8
7,Technology & Engineering,IEEE Access,"USA, Australia, New Zealand","Japan, South Korea, Egypt, Taiwan",8
8,Technology & Engineering,PeerJ Computer Science,"Canada, UK, USA","Japan, Sudan, Vietnam, Russia",8


# Detection Scores

In [93]:
# Read the detector results; `data` now refers to this frame for the rest
# of the notebook.
file_path = 'results.csv'
data = pd.read_csv(file_path).assign(
    # Scores are stored as strings with a trailing '%'; strip it and parse as float
    score=lambda frame: frame['score'].str.rstrip('%').astype(float)
)
data.head()

Unnamed: 0,id,article,category,status,text,llm,tool,score,label
0,1,1,social,non-native,original,,GPTZero,0.0,human
1,2,2,social,native,original,,GPTZero,0.0,human
2,3,3,interdisciplinary,native,original,,GPTZero,0.0,human
3,4,4,interdisciplinary,non-native,original,,GPTZero,0.0,human
4,5,5,tech&engineering,non-native,original,,GPTZero,1.0,human


## Scenario 1: Original vs AI-Generated

In [94]:
# Per-row confusion-matrix indicators (0/1) for Scenario 1.
# 'original' rows are the negatives and 'generated' rows the positives;
# rows with any other `text` value end up with 0 in all four columns.
is_original = data['text'] == 'original'
is_generated = data['text'] == 'generated'
labelled_human = data['label'] == 'human'
labelled_ai = data['label'] == 'ai'

data['FP'] = (is_original & ~labelled_human).astype(int)   # original text not labelled human
data['TN'] = (is_original & labelled_human).astype(int)    # original text labelled human
data['FN'] = (is_generated & ~labelled_ai).astype(int)     # generated text not labelled ai
data['TP'] = (is_generated & labelled_ai).astype(int)      # generated text labelled ai

# Preview the frame with the new indicator columns
data.head()

Unnamed: 0,id,article,category,status,text,llm,tool,score,label,FP,TN,FN,TP
0,1,1,social,non-native,original,,GPTZero,0.0,human,0,1,0,0
1,2,2,social,native,original,,GPTZero,0.0,human,0,1,0,0
2,3,3,interdisciplinary,native,original,,GPTZero,0.0,human,0,1,0,0
3,4,4,interdisciplinary,non-native,original,,GPTZero,0.0,human,0,1,0,0
4,5,5,tech&engineering,non-native,original,,GPTZero,1.0,human,0,1,0,0


In [95]:
# Total each confusion-matrix indicator over the whole results frame
summary = data[['FP', 'FN', 'TP', 'TN']].sum()

# Two-column table: metric name and its count
summary_df = summary.rename_axis("Metric").reset_index(name="Count")

summary_df

Unnamed: 0,Metric,Count
0,FP,35
1,FN,144
2,TP,288
3,TN,181


### Overall

In [96]:
# Confusion-matrix counts per detection tool
grouped_data_tool = data.groupby('tool')[['TP', 'TN', 'FP', 'FN']].sum().reset_index()

# Derived rates, in percent
tool_counts = grouped_data_tool[['TP', 'TN', 'FP', 'FN']]
grouped_data_tool['Accuracy'] = (tool_counts['TP'] + tool_counts['TN']) / tool_counts.sum(axis=1) * 100
grouped_data_tool['FPR'] = tool_counts['FP'] / (tool_counts['FP'] + tool_counts['TN']) * 100
grouped_data_tool['FNR'] = tool_counts['FN'] / (tool_counts['FN'] + tool_counts['TP']) * 100

# A 0/0 rate comes out as NaN; report it as 0
grouped_data_tool = grouped_data_tool.fillna(0)

# Display results
print("Overall Metrics by Tool:")
grouped_data_tool.round(2)


Overall Metrics by Tool:


Unnamed: 0,tool,TP,TN,FP,FN,Accuracy,FPR,FNR
0,DetectGPT,69,49,23,75,54.63,31.94,52.08
1,GPTZero,140,72,0,4,98.15,0.0,2.78
2,ZeroGPT,79,60,12,65,64.35,16.67,45.14


### By Author Status

In [97]:
# Confusion-matrix counts per (tool, author status)
grouped_data_status = (
    data.groupby(['tool', 'status'])[['TP', 'TN', 'FP', 'FN']].sum().reset_index()
)

# Derived rates, in percent
status_counts = grouped_data_status[['TP', 'TN', 'FP', 'FN']]
grouped_data_status['Accuracy'] = (status_counts['TP'] + status_counts['TN']) / status_counts.sum(axis=1) * 100
grouped_data_status['FPR'] = status_counts['FP'] / (status_counts['FP'] + status_counts['TN']) * 100
grouped_data_status['FNR'] = status_counts['FN'] / (status_counts['FN'] + status_counts['TP']) * 100

# A 0/0 rate comes out as NaN; report it as 0
grouped_data_status = grouped_data_status.fillna(0)

grouped_data_status.round(2)


Unnamed: 0,tool,status,TP,TN,FP,FN,Accuracy,FPR,FNR
0,DetectGPT,native,37,26,10,35,58.33,27.78,48.61
1,DetectGPT,non-native,32,23,13,40,50.93,36.11,55.56
2,GPTZero,native,71,36,0,1,99.07,0.0,1.39
3,GPTZero,non-native,69,36,0,3,97.22,0.0,4.17
4,ZeroGPT,native,41,29,7,31,64.81,19.44,43.06
5,ZeroGPT,non-native,38,31,5,34,63.89,13.89,47.22


### By Disciplines

In [98]:
# Confusion-matrix counts per (tool, discipline category)
grouped_data_category = (
    data.groupby(['tool', 'category'])[['TP', 'TN', 'FP', 'FN']].sum().reset_index()
)

# Derived rates, in percent
category_counts = grouped_data_category[['TP', 'TN', 'FP', 'FN']]
grouped_data_category['Accuracy'] = (category_counts['TP'] + category_counts['TN']) / category_counts.sum(axis=1) * 100
grouped_data_category['FPR'] = category_counts['FP'] / (category_counts['FP'] + category_counts['TN']) * 100
grouped_data_category['FNR'] = category_counts['FN'] / (category_counts['FN'] + category_counts['TP']) * 100

# A 0/0 rate comes out as NaN; report it as 0
grouped_data_category = grouped_data_category.fillna(0)
grouped_data_category.round(2)

Unnamed: 0,tool,category,TP,TN,FP,FN,Accuracy,FPR,FNR
0,DetectGPT,interdisciplinary,17,14,10,31,43.06,41.67,64.58
1,DetectGPT,social,25,21,3,23,63.89,12.5,47.92
2,DetectGPT,tech&engineering,27,14,10,21,56.94,41.67,43.75
3,GPTZero,interdisciplinary,45,24,0,3,95.83,0.0,6.25
4,GPTZero,social,47,24,0,1,98.61,0.0,2.08
5,GPTZero,tech&engineering,48,24,0,0,100.0,0.0,0.0
6,ZeroGPT,interdisciplinary,29,18,6,19,65.28,25.0,39.58
7,ZeroGPT,social,27,21,3,21,66.67,12.5,43.75
8,ZeroGPT,tech&engineering,23,21,3,25,61.11,12.5,52.08


### False Accusations

In [99]:
# Keep only the rows for original (human-written) text
original_data = data.loc[data['text'] == 'original']

# Per article: total false positives and number of rows
article_metrics = original_data.groupby('article')['FP'].agg(['sum', 'count'])
# FAR flag: the article has at least one false positive
article_metrics['FAR'] = article_metrics['sum'].gt(0).astype(int)
# MFAR flag: the article has more than one false positive
article_metrics['MFAR'] = article_metrics['sum'].gt(1).astype(int)

# Share of articles carrying each flag, in percent
n_articles = len(original_data.groupby('article'))
far_mfar = pd.DataFrame({
    "Metric": ["FAR", "MFAR"],
    "Rate (%)": [
        article_metrics[flag].sum() / n_articles * 100
        for flag in ("FAR", "MFAR")
    ],
})
far_mfar.round(2)

Unnamed: 0,Metric,Rate (%)
0,FAR,44.44
1,MFAR,4.17


In [100]:
# Per (status, article): total false positives and number of rows
article_metrics_status = original_data.groupby(['status', 'article'])['FP'].agg(['sum', 'count'])
# FAR: at least one false positive; MFAR: more than one
article_metrics_status['FAR'] = article_metrics_status['sum'].gt(0).astype(int)
article_metrics_status['MFAR'] = article_metrics_status['sum'].gt(1).astype(int)

# Collapse to one row per status: flagged-article counts and article total
status_metrics = (
    article_metrics_status.groupby('status')
    .agg(FAR=('FAR', 'sum'), MFAR=('MFAR', 'sum'), Total=('FAR', 'count'))
    .reset_index()
)

# Express each count as a percentage of the status group's articles
for flag in ('FAR', 'MFAR'):
    status_metrics[f'{flag} (%)'] = status_metrics[flag] / status_metrics['Total'] * 100

status_metrics = status_metrics.round(2)
status_metrics

Unnamed: 0,status,FAR,MFAR,Total,FAR (%),MFAR (%)
0,native,16,1,36,44.44,2.78
1,non-native,16,2,36,44.44,5.56


In [101]:
# Per (category, article): total false positives and number of rows
article_metrics_category = original_data.groupby(['category', 'article'])['FP'].agg(['sum', 'count'])
# FAR: at least one false positive; MFAR: more than one
article_metrics_category['FAR'] = article_metrics_category['sum'].gt(0).astype(int)
article_metrics_category['MFAR'] = article_metrics_category['sum'].gt(1).astype(int)

# Collapse to one row per category: flagged-article counts and article total
category_metrics = (
    article_metrics_category.groupby('category')
    .agg(FAR=('FAR', 'sum'), MFAR=('MFAR', 'sum'), Total=('FAR', 'count'))
    .reset_index()
)

# Express each count as a percentage of the category's articles
for flag in ('FAR', 'MFAR'):
    category_metrics[f'{flag} (%)'] = category_metrics[flag] / category_metrics['Total'] * 100

category_metrics = category_metrics.round(2)
category_metrics

Unnamed: 0,category,FAR,MFAR,Total,FAR (%),MFAR (%)
0,interdisciplinary,15,1,24,62.5,4.17
1,social,6,0,24,25.0,0.0
2,tech&engineering,11,2,24,45.83,8.33


## Scenario 2: AI-Assisted

### Summary Statistics

In [102]:
# Restrict to AI-assisted ('enhanced') text
enhanced_text_data = data.loc[data['text'] == 'enhanced']


def _first_mode(scores):
    """Return the first modal value of `scores`, or None when there is no mode."""
    modes = scores.mode()
    return None if modes.empty else modes.iloc[0]


# Distribution summary of the score for each tool, rounded to two decimals
summary_stats = (
    enhanced_text_data.groupby('tool')['score']
    .agg(
        min='min',
        q1=lambda scores: scores.quantile(0.25),   # first quartile
        median='median',
        q3=lambda scores: scores.quantile(0.75),   # third quartile
        max='max',
        mean='mean',
        sd='std',
        mode=_first_mode,
    )
    .reset_index()
    .round(2)
)
summary_stats

Unnamed: 0,tool,min,q1,median,q3,max,mean,sd,mode
0,DetectGPT,0.0,0.0,80.5,100.0,100.0,52.36,47.4,0.0
1,GPTZero,0.0,2.0,12.5,80.25,100.0,37.65,39.99,100.0
2,ZeroGPT,0.0,0.0,0.0,40.51,100.0,20.92,29.24,0.0


In [103]:
# Restrict to AI-assisted ('enhanced') text
enhanced_text_data = data.loc[data['text'] == 'enhanced']

# Distribution summary of the score per (tool, author status), rounded to two decimals
summary_stats = (
    enhanced_text_data.groupby(['tool', 'status'])['score']
    .agg(
        min='min',
        q1=lambda scores: scores.quantile(0.25),   # first quartile
        median='median',
        q3=lambda scores: scores.quantile(0.75),   # third quartile
        max='max',
        mean='mean',
        sd='std',
    )
    .reset_index()
    .round(2)
)
summary_stats

Unnamed: 0,tool,status,min,q1,median,q3,max,mean,sd
0,DetectGPT,native,0.0,0.0,81.5,100.0,100.0,54.4,46.79
1,DetectGPT,non-native,0.0,0.0,80.5,100.0,100.0,50.32,48.24
2,GPTZero,native,0.0,3.0,9.5,58.0,100.0,30.68,35.29
3,GPTZero,non-native,0.0,2.0,22.5,99.25,100.0,44.61,43.33
4,ZeroGPT,native,0.0,0.0,0.0,44.84,100.0,21.91,27.79
5,ZeroGPT,non-native,0.0,0.0,0.0,36.38,100.0,19.94,30.79


In [104]:
# Restrict to AI-assisted ('enhanced') text
enhanced_text_data = data.loc[data['text'] == 'enhanced']

# Distribution summary of the score per (tool, discipline category), rounded to two decimals
summary_stats = (
    enhanced_text_data.groupby(['tool', 'category'])['score']
    .agg(
        min='min',
        q1=lambda scores: scores.quantile(0.25),   # first quartile
        median='median',
        q3=lambda scores: scores.quantile(0.75),   # third quartile
        max='max',
        mean='mean',
        sd='std',
    )
    .reset_index()
    .round(2)
)
summary_stats

Unnamed: 0,tool,category,min,q1,median,q3,max,mean,sd
0,DetectGPT,interdisciplinary,0.0,0.0,83.5,100.0,100.0,51.08,47.81
1,DetectGPT,social,0.0,0.0,79.5,100.0,100.0,52.1,46.92
2,DetectGPT,tech&engineering,0.0,0.0,80.5,100.0,100.0,53.9,48.41
3,GPTZero,interdisciplinary,0.0,3.0,9.0,90.25,100.0,36.69,42.16
4,GPTZero,social,0.0,2.0,14.0,74.25,100.0,35.21,38.12
5,GPTZero,tech&engineering,0.0,2.0,26.5,80.25,100.0,41.04,40.21
6,ZeroGPT,interdisciplinary,0.0,0.0,30.48,52.96,96.64,31.73,28.77
7,ZeroGPT,social,0.0,0.0,0.0,34.72,100.0,20.06,33.15
8,ZeroGPT,tech&engineering,0.0,0.0,0.0,5.85,79.43,10.99,21.3


In [105]:
# Restrict to AI-assisted ('enhanced') text
enhanced_text_data = data.loc[data['text'] == 'enhanced']

# Distribution summary of the score per (tool, LLM), rounded to two decimals
summary_stats = (
    enhanced_text_data.groupby(['tool', 'llm'])['score']
    .agg(
        min='min',
        q1=lambda scores: scores.quantile(0.25),   # first quartile
        median='median',
        q3=lambda scores: scores.quantile(0.75),   # third quartile
        max='max',
        mean='mean',
        sd='std',
    )
    .reset_index()
    .round(2)
)
summary_stats

Unnamed: 0,tool,llm,min,q1,median,q3,max,mean,sd
0,DetectGPT,ChatGPT,0.0,0.0,0.0,86.25,100.0,29.47,44.89
1,DetectGPT,Gemini,0.0,78.5,91.0,100.0,100.0,75.25,38.03
2,GPTZero,ChatGPT,0.0,1.0,5.0,25.75,100.0,19.79,30.51
3,GPTZero,Gemini,0.0,9.0,58.0,100.0,100.0,55.5,40.55
4,ZeroGPT,ChatGPT,0.0,0.0,0.0,0.0,93.55,10.04,21.5
5,ZeroGPT,Gemini,0.0,0.0,29.34,53.06,100.0,31.8,31.93


### Statistical Tests

In [106]:
from scipy.stats import ttest_ind

# Per tool: Welch's t-test (two-sample, unequal variances) comparing the
# scores of native vs non-native authors on AI-assisted text.
welch_t_test_results = []

for tool in enhanced_text_data['tool'].unique():
    tool_data = enhanced_text_data.loc[enhanced_text_data['tool'] == tool]
    native_scores = tool_data.loc[tool_data['status'] == 'native', 'score']
    non_native_scores = tool_data.loc[tool_data['status'] == 'non-native', 'score']

    # equal_var=False selects Welch's variant; missing scores are ignored
    t_stat, p_value = ttest_ind(
        native_scores,
        non_native_scores,
        equal_var=False,
        nan_policy='omit',
    )

    welch_t_test_results.append({'Tool': tool, 'Statistic': t_stat, 'p-value': p_value})

# One row per tool
welch_t_test_results_df = pd.DataFrame(welch_t_test_results)
welch_t_test_results_df


Unnamed: 0,Tool,Statistic,p-value
0,GPTZero,-2.115188,0.036232
1,ZeroGPT,0.403792,0.68698
2,DetectGPT,0.515602,0.606935


In [107]:
from statsmodels.stats.anova import AnovaRM
import statsmodels.api as sm
from statsmodels.formula.api import ols

# One-way ANOVA of score by discipline category, run separately for each tool.
# NOTE(review): this is not Welch's ANOVA. It fits an OLS model and computes a
# Type-II ANOVA table using an HC3 heteroscedasticity-robust covariance.
# Welch's ANOVA is available as
# statsmodels.stats.oneway.anova_oneway(..., use_var='unequal');
# confirm which test the write-up intends before relabelling the results.
anova_results = []

for tool in enhanced_text_data['tool'].unique():
    tool_data = enhanced_text_data[enhanced_text_data['tool'] == tool]

    # Fit the model: score explained by the categorical `category` factor
    model = ols('score ~ category', data=tool_data).fit()
    anova_table = sm.stats.anova_lm(model, typ=2, robust='hc3')  # Type-II ANOVA with robust HC3
    # Tag the table with its tool so the rows stay identifiable after concatenation
    anova_table['Tool'] = tool
    anova_results.append(anova_table.reset_index())

# Combine results into a single DataFrame
anova_results_df = pd.concat(anova_results, ignore_index=True)

# Display the results
anova_results_df

Unnamed: 0,index,sum_sq,df,F,PR(>F),Tool
0,category,890.956098,2.0,0.275691,0.759456,GPTZero
1,Residual,227836.145833,141.0,,,GPTZero
2,category,12601.843547,2.0,7.9407,0.00054,ZeroGPT
3,Residual,111883.072858,141.0,,,ZeroGPT
4,category,187.158491,2.0,0.041101,0.959743,DetectGPT
5,Residual,321026.625,141.0,,,DetectGPT


In [108]:
# Per tool: Welch's t-test (two-sample, unequal variances) comparing the
# scores of ChatGPT-enhanced vs Gemini-enhanced text.
welch_t_test_results = []

for tool in enhanced_text_data['tool'].unique():
    tool_data = enhanced_text_data[enhanced_text_data['tool'] == tool]
    # Named after the LLM they hold (the earlier native/non-native names were
    # a copy-paste leftover and mislabelled these samples).
    chatgpt_scores = tool_data[tool_data['llm'] == 'ChatGPT']['score']
    gemini_scores = tool_data[tool_data['llm'] == 'Gemini']['score']

    # equal_var=False selects Welch's variant; missing scores are ignored.
    # A negative statistic means the ChatGPT mean is below the Gemini mean.
    t_stat, p_value = ttest_ind(chatgpt_scores, gemini_scores, equal_var=False, nan_policy='omit')

    # Store results
    welch_t_test_results.append({
        'Tool': tool,
        'Statistic': t_stat,
        'p-value': p_value
    })

# Convert results to a DataFrame (one row per tool)
welch_t_test_results_df = pd.DataFrame(welch_t_test_results)
welch_t_test_results_df

Unnamed: 0,Tool,Statistic,p-value
0,GPTZero,-5.971151,2.055113e-08
1,ZeroGPT,-4.796977,4.522249e-06
2,DetectGPT,-6.602028,8.025894e-10


### Density Plots

In [123]:
import pandas as pd
import numpy as np
import plotly.express as px
from scipy.stats import gaussian_kde

# Build one Gaussian-KDE curve per (tool, author status) from the AI-assisted scores
density_data = []

for tool in enhanced_text_data['tool'].unique():
    tool_data = enhanced_text_data.loc[enhanced_text_data['tool'] == tool]
    for status in tool_data['status'].unique():
        # Missing scores are removed before fitting
        status_data = tool_data.loc[tool_data['status'] == status, 'score'].dropna()
        if len(status_data) <= 1:
            # Not enough points for a density estimate
            continue
        density = gaussian_kde(status_data)
        # Evaluate on 100 evenly spaced points across the observed score range
        x_vals = np.linspace(status_data.min(), status_data.max(), 100)
        density_data.append(pd.DataFrame({
            'Score': x_vals,
            'Density': density(x_vals),
            'Author Status': status,
            'Tool': tool
        }))

# Stack all curves into one long-format frame for plotting
density_df = pd.concat(density_data, ignore_index=True)

# Custom colors for accessibility
custom_colors = ['#66c2a5', '#fc8d62', '#8da0cb']

# One facet per tool, one line per author status
fig = px.line(
    density_df,
    x="Score",
    y="Density",
    color="Author Status",
    line_group="Author Status",
    facet_col="Tool",
    title="Density Plot of Scores by Author Status",
    labels={"Score": "Score (%)", "Density": "Density"},
    color_discrete_sequence=custom_colors,
    width=1400,
    height=900
)

# Facet titles default to "Tool=<name>"; keep only the name
for annotation in fig.layout.annotations:
    if annotation.text.startswith("Tool="):
        annotation.text = annotation.text.replace("Tool=", "")

# Final title (replaces the one passed to px.line), legend and theme
fig.update_layout(
    title=dict(
        text="AI-Generated Probability Scores by Tool and Author Status",
        y=0.97,
        x=0.5,
        xanchor='center',
        yanchor='top'
    ),
    legend_title_text='Author Status',
    template='plotly_white',
    font=dict(size=18)
)

fig.update_traces(line=dict(width=3))

fig.show()

In [124]:
# Prepare the data for density estimation by discipline (category) and tool
density_data = []

for tool in enhanced_text_data['tool'].unique():
    tool_data = enhanced_text_data[enhanced_text_data['tool'] == tool]
    for category in tool_data['category'].unique():
        # Drop missing scores before fitting, as the author-status density plot
        # does; NaNs would otherwise reach gaussian_kde and the linspace bounds.
        category_data = tool_data[tool_data['category'] == category]['score'].dropna()
        # KDE needs at least two distinct values: a single point or a constant
        # sample has zero variance and cannot be fitted.
        if category_data.nunique() > 1:
            density = gaussian_kde(category_data)
            # Evaluate on 100 evenly spaced points across the observed score range
            x_vals = np.linspace(category_data.min(), category_data.max(), 100)
            y_vals = density(x_vals)
            density_data.append(pd.DataFrame({
                'Score': x_vals,
                'Density': y_vals,
                'Discipline': category,
                'Tool': tool
            }))

# Combine all density data into one long-format frame
density_df = pd.concat(density_data, ignore_index=True)

# Custom colors for accessibility
custom_colors = ['#66c2a5', '#fc8d62', '#8da0cb']

# Plot density with one facet per tool and one line per discipline
fig = px.line(
    density_df,
    x="Score",
    y="Density",
    color="Discipline",
    facet_col="Tool",
    title="Density Plot of Scores by Discipline",
    labels={"Score": "Score (%)", "Density": "Density"},
    line_group="Discipline",
    # Fix the legend/color order of the disciplines
    category_orders={"Discipline": ["tech&engineering", "social", "interdisciplinary"]},
    color_discrete_sequence=custom_colors,
    width=1400,
    height=900
)

# Customize facet titles to remove "Tool=" prefix
for annotation in fig.layout.annotations:
    if annotation.text.startswith("Tool="):
        annotation.text = annotation.text.replace("Tool=", "")

# Final title (replaces the one passed to px.line), legend and theme
fig.update_layout(
    title={
        'text': "AI-Generated Probability Scores by Tool and Discipline",
        'y':0.97,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    legend_title_text='Discipline',
    template='plotly_white',
    font=dict(size=18)
)

fig.update_traces(line=dict(width=3))

fig.show()


In [125]:
# Prepare the data for density estimation by LLM and tool
density_data = []

for tool in enhanced_text_data['tool'].unique():
    tool_data = enhanced_text_data[enhanced_text_data['tool'] == tool]
    for llm in tool_data['llm'].unique():
        # Drop missing scores before fitting, as the author-status density plot
        # does; NaNs would otherwise reach gaussian_kde and the linspace bounds.
        llm_data = tool_data[tool_data['llm'] == llm]['score'].dropna()
        # KDE needs at least two distinct values: a single point or a constant
        # sample has zero variance and cannot be fitted.
        if llm_data.nunique() > 1:
            density = gaussian_kde(llm_data)
            # Evaluate on 100 evenly spaced points across the observed score range
            x_vals = np.linspace(llm_data.min(), llm_data.max(), 100)
            y_vals = density(x_vals)
            density_data.append(pd.DataFrame({
                'Score': x_vals,
                'Density': y_vals,
                'LLM': llm,
                'Tool': tool
            }))

# Combine all density data into one long-format frame
density_df = pd.concat(density_data, ignore_index=True)

# Custom colors for accessibility
custom_colors = ['#66c2a5', '#fc8d62', '#8da0cb']

# Plot density with one facet per tool and one line per LLM
fig = px.line(
    density_df,
    x="Score",
    y="Density",
    color="LLM",
    facet_col="Tool",
    title="Density Plot of Scores by LLM",
    labels={"Score": "Score (%)", "Density": "Density"},
    line_group="LLM",
    color_discrete_sequence=custom_colors,
    width=1400,
    height=900
)

# Customize facet titles to remove "Tool=" prefix
for annotation in fig.layout.annotations:
    if annotation.text.startswith("Tool="):
        # Remove "Tool=" from the annotation text
        annotation.text = annotation.text.replace("Tool=", "")

# Final title (replaces the one passed to px.line), legend and theme
fig.update_layout(
    title={
        'text': "AI-Generated Probability Scores by Tool and LLM",
        'y':0.97,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    legend_title_text='LLM',
    template='plotly_white',
    font=dict(size=18)
)

fig.update_traces(line=dict(width=3))

fig.show()

### Under-Over Detections

In [112]:
# UDR / ODR per tool on AI-assisted text:
#   UDR = share of abstracts scored exactly 0, ODR = share scored exactly 100.
detectors = ['GPTZero', 'ZeroGPT', 'DetectGPT']
udr_odr_metrics = {}

for detector in detectors:
    # AI-assisted rows evaluated by this detector
    is_enhanced = enhanced_text_data['text'] == 'enhanced'
    is_detector = enhanced_text_data['tool'] == detector
    detector_data = enhanced_text_data[is_enhanced & is_detector]

    total_ai_assisted = detector_data.shape[0]
    udr_count = detector_data[detector_data['score'] == 0].shape[0]
    odr_count = detector_data[detector_data['score'] == 100].shape[0]

    # Rates in percent; left undefined (None) when the detector has no rows
    if total_ai_assisted > 0:
        UDR = (udr_count / total_ai_assisted) * 100
        ODR = (odr_count / total_ai_assisted) * 100
    else:
        UDR = ODR = None

    udr_odr_metrics[detector] = {'UDR (%)': UDR, 'ODR (%)': ODR}

# Detectors become columns, the two rates become rows
udr_odr_df = pd.DataFrame(udr_odr_metrics)

udr_odr_df.round(2)


Unnamed: 0,GPTZero,ZeroGPT,DetectGPT
UDR (%),3.47,58.33,44.44
ODR (%),18.06,2.08,34.03


In [113]:
# UDR / ODR per tool, broken down by author status
udr_odr_by_status = {}

for detector in detectors:
    # AI-assisted rows evaluated by this detector
    detector_rows = enhanced_text_data[
        (enhanced_text_data['text'] == 'enhanced') &
        (enhanced_text_data['tool'] == detector)
    ]

    for status_value in enhanced_text_data['status'].unique():
        filtered_data = detector_rows[detector_rows['status'] == status_value]

        # Scores of exactly 0 feed UDR; scores of exactly 100 feed ODR
        total_ai_assisted = filtered_data.shape[0]
        udr_count = filtered_data[filtered_data['score'] == 0].shape[0]
        odr_count = filtered_data[filtered_data['score'] == 100].shape[0]

        # Rates in percent; None when the group is empty
        if total_ai_assisted > 0:
            UDR = udr_count / total_ai_assisted * 100
            ODR = odr_count / total_ai_assisted * 100
        else:
            UDR = ODR = None

        # Rows are keyed like "native UDR (%)" / "native ODR (%)"
        detector_rates = udr_odr_by_status.setdefault(detector, {})
        detector_rates[f'{status_value} UDR (%)'] = UDR
        detector_rates[f'{status_value} ODR (%)'] = ODR

# Detectors become columns; round to two decimal places
udr_odr_by_status_df = pd.DataFrame(udr_odr_by_status)
udr_odr_by_status_df.round(2)

Unnamed: 0,GPTZero,ZeroGPT,DetectGPT
non-native UDR (%),2.78,63.89,47.22
non-native ODR (%),25.0,2.78,34.72
native UDR (%),4.17,52.78,41.67
native ODR (%),11.11,1.39,33.33


In [114]:
# UDR / ODR per tool, broken down by discipline category
udr_odr_by_category = {}

for detector in detectors:
    # AI-assisted rows evaluated by this detector
    detector_rows = enhanced_text_data[
        (enhanced_text_data['text'] == 'enhanced') &
        (enhanced_text_data['tool'] == detector)
    ]

    for category_value in enhanced_text_data['category'].unique():
        filtered_data = detector_rows[detector_rows['category'] == category_value]

        # Scores of exactly 0 feed UDR; scores of exactly 100 feed ODR
        total_ai_assisted = filtered_data.shape[0]
        udr_count = filtered_data[filtered_data['score'] == 0].shape[0]
        odr_count = filtered_data[filtered_data['score'] == 100].shape[0]

        # Rates in percent; None when the group is empty
        if total_ai_assisted > 0:
            UDR = udr_count / total_ai_assisted * 100
            ODR = odr_count / total_ai_assisted * 100
        else:
            UDR = ODR = None

        # Rows are keyed like "social UDR (%)" / "social ODR (%)"
        detector_rates = udr_odr_by_category.setdefault(detector, {})
        detector_rates[f'{category_value} UDR (%)'] = UDR
        detector_rates[f'{category_value} ODR (%)'] = ODR

# Detectors become columns; round to two decimal places
udr_odr_by_category_df = pd.DataFrame(udr_odr_by_category)
udr_odr_by_category_df.round(2)

Unnamed: 0,GPTZero,ZeroGPT,DetectGPT
social UDR (%),4.17,66.67,43.75
social ODR (%),12.5,6.25,29.17
interdisciplinary UDR (%),2.08,33.33,45.83
interdisciplinary ODR (%),25.0,0.0,31.25
tech&engineering UDR (%),4.17,75.0,43.75
tech&engineering ODR (%),16.67,0.0,41.67


In [115]:
# UDR / ODR per tool, broken down by the LLM used for the enhancement.
# The result names still say "category" and are kept as they are.
udr_odr_by_category = {}

for detector in detectors:
    # AI-assisted rows evaluated by this detector
    detector_rows = enhanced_text_data[
        (enhanced_text_data['text'] == 'enhanced') &
        (enhanced_text_data['tool'] == detector)
    ]

    for llm_value in enhanced_text_data['llm'].unique():
        filtered_data = detector_rows[detector_rows['llm'] == llm_value]

        # Scores of exactly 0 feed UDR; scores of exactly 100 feed ODR
        total_ai_assisted = filtered_data.shape[0]
        udr_count = filtered_data[filtered_data['score'] == 0].shape[0]
        odr_count = filtered_data[filtered_data['score'] == 100].shape[0]

        # Rates in percent; None when the group is empty
        if total_ai_assisted > 0:
            UDR = udr_count / total_ai_assisted * 100
            ODR = odr_count / total_ai_assisted * 100
        else:
            UDR = ODR = None

        # Rows are keyed like "ChatGPT UDR (%)" / "ChatGPT ODR (%)"
        detector_rates = udr_odr_by_category.setdefault(detector, {})
        detector_rates[f'{llm_value} UDR (%)'] = UDR
        detector_rates[f'{llm_value} ODR (%)'] = ODR

# Detectors become columns; round to two decimal places
udr_odr_by_category_df = pd.DataFrame(udr_odr_by_category)
udr_odr_by_category_df.round(2)

Unnamed: 0,GPTZero,ZeroGPT,DetectGPT
ChatGPT UDR (%),5.56,76.39,69.44
ChatGPT ODR (%),8.33,0.0,22.22
Gemini UDR (%),1.39,40.28,19.44
Gemini ODR (%),27.78,4.17,45.83
