{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# MN_C\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from IPython.display import clear_output" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "\n", "import numpy as np\n", "from sklearn.metrics.cluster import normalized_mutual_info_score\n", "\n", "import date_pornire\n", "from bisect import bisect\n", "\n", "\n", "import networkx as nx\n", "from networkx.algorithms.community import modularity\n", "from networkx.utils import py_random_state\n", "\n", "import pandas as pd\n", "import itertools\n", "from pingouin import ttest" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import mak1 as mk" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def reduce(lista, Linii_unice_Noduri, G, clustere, cati, clase):\n", " lista_noua=lista.copy()\n", " cati_singuri=0\n", " for ix,cluster in enumerate(clustere[np.argsort(cati)[:-clase]]):\n", " cu_cine=0\n", " potriviri=np.zeros(len(Linii_unice_Noduri))\n", " potriviri_noduri=np.zeros(len(Linii_unice_Noduri))\n", " cluster=int(cluster)\n", " for celalalt_cluster in clustere[np.argsort(cati)][ix+1:]:\n", " celalalt_cluster=int(celalalt_cluster)\n", " for i in range(Linii_unice_Noduri.shape[1]-1): \n", " if (Linii_unice_Noduri[cluster,i]== Linii_unice_Noduri[celalalt_cluster,i])&( Linii_unice_Noduri[cluster,i+1]== Linii_unice_Noduri[celalalt_cluster,i+1]):\n", " potriviri[celalalt_cluster]+=G.get_edge_data(Linii_unice_Noduri[cluster,i], Linii_unice_Noduri[cluster,i+1])['weight'] \n", " potriviri_noduri[celalalt_cluster]=len(set.intersection(set(Linii_unice_Noduri[cluster]), set(Linii_unice_Noduri[celalalt_cluster])))\n", " if np.sum(potriviri)!=0:\n", " cu_cine=np.argmax(potriviri) \n", " else:\n", " cu_cine=np.argmax(potriviri_noduri) if np.sum(potriviri_noduri)!=0 else cluster\n", " cati_singuri+=1\n", " # print(cluster, cu_cine, potriviri, potriviri_noduri)\n", " lista_noua[lista==cluster]=cu_cine\n", " return lista_noua, np.sum(np.abs(lista-lista_noua))==0, cati_singuri" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "\n", "def MNC(X,Y,clase):\n", " Lista=[]\n", " print(clase)\n", " Noduri=mk.TabelNoduriIntervale(X, clase)\n", " # Noduri=mk.TabelNoduriKMeans(X,clase)\n", " lista=np.zeros(X.shape[0])\n", " unire=True\n", " for index,linie in enumerate(np.unique(Noduri, axis=0)):\n", " # print((Noduri == linie).all(axis=1))\n", " lista[(Noduri == linie).all(axis=1)]=index\n", " G=mk.CreezGraf1(Noduri)\n", " \n", " # \n", " \n", " clustere, cati=np.unique(lista, return_counts=True)\n", " # if np.all(cati==1):\n", " # lista=daca_sunt_toti_separati(Noduri)\n", " Linii_unice_Noduri=np.unique(Noduri, axis=0)\n", " lista_noua=lista.copy()\n", " Lista.append(lista_noua)\n", " print(len(np.unique(lista_noua)))\n", " neschimbat=False\n", " while (len(np.unique(lista_noua))>clase) and (not neschimbat):\n", " print('aici', len(np.unique(lista_noua)), clase)\n", " clustere, cati=np.unique(lista_noua, return_counts=True)\n", " lista_noua,neschimbat, cati_singuri=reduce(lista_noua, Linii_unice_Noduri, G, clustere, cati, clase)\n", " print(len(np.unique(lista_noua)), normalized_mutual_info_score(lista_noua, Y), not neschimbat)\n", " print(((len(np.unique(lista_noua)>clase)) and (not neschimbat)))\n", " Lista.append(lista_noua)\n", " # neschimbat=True\n", " # Lista.reverse()\n", " # for lista in Lista:\n", " # if len(np.unique(lista))>clase:\n", " # lista_noua=lista.copy()\n", " # break\n", " return normalized_mutual_info_score(lista_noua, Y), len(np.unique(lista_noua)), normalized_mutual_info_score(Lista[0], Y) ,mk.ei_kmeans(X,Y,clase), mk.ei_gm(X,Y,clase), mk.ei_AffinityPropagation(X,Y,clase), mk.ei_Birch(X,Y,clase), mk.ei_MeanShift(X,Y,clase)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# some data\n", "number_of_clusters_in_data=20\n", "X,Y,skf=date_pornire.date(100, 30 , nr_clase = number_of_clusters_in_data, \n", " seed_set_data = 1, pondere_set =[1/number_of_clusters_in_data]*number_of_clusters_in_data, class_sepa=1, \n", " nrfolds=10 )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "MNC(X,Y,number_of_clusters_in_data)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# multiple experiments synthetic data\n", "Number_of_instances = [100, 200, 500, 1000,2000]\n", "Number_of_attributes = [30, 50, 100, 150,200,250,1000]\n", "\n", "crt_seed_data = 50\n", "\n", "Class_separator = [0.1, 0.5, 1]\n", "Number_of_classes =[30, 50, 100, 150,200,250,1000]\n", "crt_seed_fold = 60" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Parametrii=[Number_of_instances, Number_of_attributes, Class_separator, Number_of_classes]\n", "data=pd.DataFrame(columns=['data','rulare','instante','atribute','suprapunere', 'clase', 'noi_nmi','cati','noi_max','kmeans_nmi', 'kmeans_count','gm_nmi','gm_count','ap_nmi','ap_count', 'bi_nmi','bi_count', 'ms_nmi','ms_count' ])\n", "\n", "for nr_data, parametrii in enumerate(itertools.product(*Parametrii)):\n", " if nr_data>=0:\n", "\n", " instante=parametrii[0]\n", " atribute=parametrii[1]\n", " suprapunere=parametrii[2]\n", " nr_clase=parametrii[3]\n", " # if (nr_clase<=atribute) and (nr_clase20000):\n", " ebun=False\n", " if ebun:\n", " coloane=df.select_dtypes('object').columns\n", " for col in coloane:\n", " df[col]=pd.factorize(df[col])[0]\n", " X=np.array(df.iloc[:,:-1])\n", " Y=np.array(df.Class)\n", " nr_clase=len(np.unique(Y))\n", " noi, cati,noi_max, (km, km_c), (gm,gm_c), (ap,ap_c), (bi,bi_c),( ms, ms_c) = MNC(X,Y,nr_clase)\n", " rez.loc[len(rez)]=[tabel, X.shape[0], X.shape[1], len(np.unique(Y)), noi,cati, noi_max, km, km_c, gm,gm_c, ap,ap_c, bi,bi_c, ms, ms_c ]\n", " rez.to_csv('reale.csv')\n", " clear_output(wait=True)\n", "print('🐢') " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.7 ('base')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "12fb2be87a819bf26d890c12da0d95a690ab9d5f61cf85258facb8e8ace66f7a" } } }, "nbformat": 4, "nbformat_minor": 2 }