def _zero_streaks(values):
    """Return ``(start_position, length)`` for every run of consecutive zeros."""
    streaks = []
    run_length = 0
    for position, value in enumerate(values):
        if value == 0:
            run_length += 1
        elif run_length:
            # A non-zero value closes the run that ended just before it.
            streaks.append((position - run_length, run_length))
            run_length = 0
    # The data may end while a run of zeros is still open.
    if run_length:
        streaks.append((len(values) - run_length, run_length))
    return streaks


def SBU(df, target_column, percentage):
    """
    Remove a share of the zeros from every zero streak longer than average.

    A streak is a run of consecutive rows whose ``target_column`` value
    equals 0. The average streak length is computed over all streaks, and
    only streaks strictly longer than that average are thinned. Within such
    a streak, the rows to drop are spread evenly from its first to its last
    row.

    Parameters:
        df (pd.DataFrame): DataFrame with the data.
        target_column (str): The name of the target column with the values.
        percentage (float): Fraction of each qualifying streak to remove
            (e.g. 0.5 removes half of the zeros). Values above 1 remove the
            whole streak.

    Returns:
        pd.DataFrame: ``df`` without the selected rows. If the column holds
        no zeros at all, ``df`` itself is returned (not a copy).

    Raises:
        ValueError: Raised by ``np.linspace`` when ``percentage`` is negative
            enough to yield a negative number of rows to remove.
    """
    streaks = _zero_streaks(df[target_column].values)
    if not streaks:
        return df  # No zeros, so there is nothing to remove

    average_length = sum(length for _, length in streaks) / len(streaks)

    # Positional mask: True keeps the row, False drops it.
    keep = np.ones(len(df), dtype=bool)
    for start, length in streaks:
        if length <= average_length:
            continue
        # Round away binary floating-point noise before truncating:
        # 100 * 0.29 evaluates to 28.999999999999996 and would otherwise
        # truncate to 28 instead of 29. Clamp to the streak length so that
        # no position is requested twice when percentage exceeds 1.
        num_to_remove = min(length, int(round(length * percentage, 9)))
        drop = np.linspace(start, start + length - 1, num_to_remove, dtype=int)
        keep[drop] = False

    return df[keep]


def analyze_target_variable(df, target_variable):
    """
    Plot the distribution of a column and print summary counts for it.

    Shows a 50-bin histogram and a KDE plot of ``df[target_variable]``, then
    prints: the number of non-null values, the number of zeros, how many
    values exceed each of several thresholds, the proportions of zero and
    positive values, and how many values fall in ``(0, r]`` for several
    upper bounds ``r``.

    Parameters:
        df (pd.DataFrame): DataFrame with the data.
        target_variable (str): Name of the column to analyze.

    Returns:
        None. The results are shown as plots and printed to stdout.
    """
    series = df[target_variable]

    # Histogram
    plt.hist(series, alpha=0.7, edgecolor='black', bins=50)
    plt.xlabel(target_variable)
    plt.ylabel('Frequency')
    plt.show()

    # KDE Plot
    sns.kdeplot(series)
    plt.show()

    # Calculate statistics
    total_values = series.count()
    print(f"Total values = {total_values}")

    zero_values = (series == 0).sum()
    # Count strictly positive values directly; "total - zeros" would also
    # count negative values and contradict the label printed below.
    positive_values = (series > 0).sum()
    print(f"Total values equal to 0 = {zero_values}")

    thresholds = [0, 1, 2, 4, 8, 16, 32]
    for threshold in thresholds:
        count = (series > threshold).sum()
        print(f"Values greater than {threshold} = {count}")

    if total_values:
        proportion_zero = zero_values / total_values
        proportion_positive = positive_values / total_values
    else:
        # Empty or all-NaN column: the proportions are undefined.
        proportion_zero = proportion_positive = float("nan")
    print(f"Proportion of values equal to 0: {proportion_zero:.2f}")
    print(f"Proportion of values greater than 0: {proportion_positive:.2f}")

    ranges = [0.1, 0.5, 1, 0.05, 0.025]
    for r in ranges:
        values_between_a_and_b = ((series > 0) & (series <= r)).sum()
        print(f"Total values greater than 0 but less than or equal to {r} = {values_between_a_and_b}")