
from gensim import corpora, models, similarities
import re
import os
from nltk.corpus import stopwords

import glob
import copy
import random
import time
import json
import pickle
import nltk
import collections
from collections import Counter
from itertools import combinations
import numpy as np
from random import shuffle

import torch
import torch.utils.data
import torch.nn.functional as F

from tools.logger import *

import dgl
from dgl.data.utils import save_graphs, load_graphs

FILTERWORD = stopwords.words('english')
punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\'\'', '\'', '`', '``',
                '-', '--', '|', '\\/', '!', ',', '.', '?', '-s', '-ly', '</s>', 's', '(', ')', '’', '.', 'i', '.i', ':', '', '"']
FILTERWORD.extend(punctuations)


######################################### Example #########################################

class Example(object):
    """Class representing a train/val/test example for single-document extractive summarization."""

    def __init__(self, article_sents, abstract_sents, vocab, sent_max_len, label):
        """ Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

        :param article_sents: list(strings) for single document or list(list(string)) for multi-document; one per article sentence. each token is separated by a single space.
        :param abstract_sents: list(strings); one per abstract sentence. In each sentence, each token is separated by a single space.
        :param vocab: Vocabulary object
        :param sent_max_len: int, max length of each sentence
        :param label: list, the No of selected sentence, e.g. [1,3,5]
        """

        self.sent_max_len = sent_max_len
        self.enc_sent_len = []
        self.enc_sent_input = []
        self.enc_sent_input_pad = []

        # Store the original strings
        self.original_article_sents = article_sents
        self.original_abstract = "\n".join(abstract_sents)

        # Process the article
        if isinstance(article_sents, list) and isinstance(article_sents[0], list):  # multi document
            self.original_article_sents = []
            for doc in article_sents:
                self.original_article_sents.extend(doc)
        for sent in self.original_article_sents:
            article_words = sent.split()
            self.enc_sent_len.append(len(article_words))  # store the length before padding
            self.enc_sent_input.append([vocab.word2id(w.lower()) for w in article_words])  # list of word ids; OOVs are represented by the id for UNK token
        self._pad_encoder_input(vocab.word2id('[PAD]'))

        # Store the label
        self.label = label
        label_shape = (len(self.original_article_sents), len(label))  # [N, len(label)]
        # label_shape = (len(self.original_article_sents), len(self.original_article_sents))
        self.label_matrix = np.zeros(label_shape, dtype=int)
        if label != []:
            self.label_matrix[np.array(label), np.arange(len(label))] = 1  # label_matrix[i][j]=1 indicate the i-th sent will be selected in j-th step

    def _pad_encoder_input(self, pad_id):
        """
        :param pad_id: int; token pad id
        :return: 
        """
        max_len = self.sent_max_len
        for i in range(len(self.enc_sent_input)):
            article_words = self.enc_sent_input[i].copy()
            if len(article_words) > max_len:
                article_words = article_words[:max_len]
            if len(article_words) < max_len:
                article_words.extend([pad_id] * (max_len - len(article_words)))
            self.enc_sent_input_pad.append(article_words)


class Example2(Example):
    """Class representing a train/val/test example for multi-document extractive summarization."""

    def __init__(self, article_sents, abstract_sents, vocab, sent_max_len, label):
        """ Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self.

        :param article_sents: list(list(string)) for multi-document; one per article sentence. each token is separated by a single space.
        :param abstract_sents: list(strings); one per abstract sentence. In each sentence, each token is separated by a single space.
        :param vocab: Vocabulary object
        :param sent_max_len: int, max length of each sentence
        :param label: list, the No of selected sentence, e.g. [1,3,5]
        """

        super().__init__(article_sents, abstract_sents, vocab, sent_max_len, label)
        cur = 0
        self.original_articles = []
        self.article_len = []
        self.enc_doc_input = []
        for doc in article_sents:
            if len(doc) == 0:
                continue
            docLen = len(doc)
            self.original_articles.append(" ".join(doc))
            self.article_len.append(docLen)
            self.enc_doc_input.append(catDoc(self.enc_sent_input[cur:cur + docLen]))
            cur += docLen


######################################### ExampleSet #########################################

class ExampleSet(torch.utils.data.Dataset):
    """ Constructor: Dataset of example(object) for single document summarization"""

    def __init__(self, data_path, vocab, doc_max_timesteps, sent_max_len, filter_word_path, w2s_path):
        """ Initializes the ExampleSet with the path of data
        
        :param data_path: string; the path of data
        :param vocab: object;
        :param doc_max_timesteps: int; the maximum sentence number of a document, each example should pad sentences to this length
        :param sent_max_len: int; the maximum token number of a sentence, each sentence should pad tokens to this length
        :param filter_word_path: str; file path, the file must contain one word for each line and the tfidf value must go from low to high (the format can refer to script/lowTFIDFWords.py) 
        :param w2s_path: str; file path, each line in the file contain a json format data (which can refer to the format can refer to script/calw2sTFIDF.py)
        """

        self.vocab = vocab
        self.sent_max_len = sent_max_len
        self.doc_max_timesteps = doc_max_timesteps

        logger.info("[INFO] Start reading %s", self.__class__.__name__)
        start = time.time()
        self.example_list = readJson(data_path)
        logger.info("[INFO] Finish reading %s. Total time is %f, Total size is %d", self.__class__.__name__,
                    time.time() - start, len(self.example_list))
        self.size = len(self.example_list)

        logger.info("[INFO] Loading filter word File %s", filter_word_path)
        tfidf_w = readText(filter_word_path)
        self.filterwords = FILTERWORD  # note: aliases the module-level list, so the low-tfidf words appended below are also filtered in topics()
        self.filterids = [vocab.word2id(w.lower()) for w in FILTERWORD]
        self.filterids.append(vocab.word2id("[PAD]"))   # keep "[UNK]" but remove "[PAD]"
        lowtfidf_num = 0
        for w in tfidf_w:
            if vocab.word2id(w) != vocab.word2id('[UNK]'):
                self.filterwords.append(w)
                self.filterids.append(vocab.word2id(w))
                lowtfidf_num += 1
            if lowtfidf_num > 5000:
                break

        logger.info("[INFO] Loading word2sent TFIDF file from %s!" % w2s_path)
        self.w2s_tfidf = readJson(w2s_path)

    def get_example(self, index):
        e = self.example_list[index]
        e["summary"] = e.setdefault("summary", [])
        example = Example(e["text"], e["summary"], self.vocab, self.sent_max_len, e["label"])
        return example

    def pad_label_m(self, label_matrix):
        """ Truncate the label matrix to at most doc_max_timesteps rows and columns, then zero-pad the columns up to doc_max_timesteps. """
        label_m = label_matrix[:self.doc_max_timesteps, :self.doc_max_timesteps]
        N, m = label_m.shape
        if m < self.doc_max_timesteps:
            pad_m = np.zeros((N, self.doc_max_timesteps - m))
            return np.hstack([label_m, pad_m])
        return label_m

    def AddWordNode(self, G, inputid):
        """ Add one word node for every distinct, non-filtered word id in inputid.

        :return: wid2nid, nid2wid: dicts mapping word id <-> node id
        """
        wid2nid = {}
        nid2wid = {}
        nid = 0
        for sentid in inputid:
            for wid in sentid:
                if wid not in self.filterids and wid not in wid2nid.keys():
                    wid2nid[wid] = nid
                    nid2wid[nid] = wid
                    nid += 1

        w_nodes = len(nid2wid)

        G.add_nodes(w_nodes)
        G.set_n_initializer(dgl.init.zero_initializer)
        G.ndata["unit"] = torch.zeros(w_nodes)
        G.ndata["id"] = torch.LongTensor(list(nid2wid.values()))
        G.ndata["dtype"] = torch.zeros(w_nodes)

        return wid2nid, nid2wid

    def CreateGraph(self, input_pad, label, w2s_w):
        """ Create a heterogeneous graph for one document

        :param input_pad: list(list); [sentnum, wordnum]
        :param label: list(list); [sentnum, sentnum]
        :param w2s_w: dict(dict) {str: {str: float}}; for each sentence and each word, the tfidf between them
        :return: G: dgl.DGLGraph
            node:
                word: unit=0, dtype=0, id=(int)wordid in vocab
                sentence: unit=1, dtype=1, words=tensor, position=int, label=tensor
                topic: unit=2, dtype=2, words=tensor, position=int (3 LDA topic nodes)
            edge:
                word2sent, sent2word: tffrac=int, dtype=0
                sent2sent: dtype=1
                sent2topic, topic2sent: dtype=2, feat=float (topic probability)
        """
        G = dgl.DGLGraph()
        wid2nid, nid2wid = self.AddWordNode(G, input_pad)
        w_nodes = len(nid2wid)
        N = len(input_pad)
        
        G.add_nodes(N)
        G.ndata["unit"][w_nodes:] = torch.ones(N)
        G.ndata["dtype"][w_nodes:] = torch.ones(N)
        sentid2nid = [i + w_nodes for i in range(N)]
        if N != 0:
            # add 3 topic nodes (unit=2, dtype=2) after the word and sentence nodes
            G.add_nodes(3)
            ws_nodes = N + w_nodes
            G.ndata["dtype"][ws_nodes:] = torch.tensor([2., 2., 2.])
            G.ndata["unit"][ws_nodes:] = torch.tensor([2., 2., 2.])

        G.set_e_initializer(dgl.init.zero_initializer)
        for i in range(N):
            c = Counter(input_pad[i])
            sent_nid = sentid2nid[i]
            sent_tfw = w2s_w[str(i)]
            for wid in c.keys():
                if wid in wid2nid.keys() and self.vocab.id2word(wid) in sent_tfw.keys():
                    tfidf = sent_tfw[self.vocab.id2word(wid)]
                    tfidf_box = np.round(tfidf * 9)  # box = 10
                    G.add_edges(wid2nid[wid], sent_nid,
                                data={"tffrac": torch.LongTensor([tfidf_box]), "dtype": torch.Tensor([0])})
                    G.add_edges(sent_nid, wid2nid[wid],
                                data={"tffrac": torch.LongTensor([tfidf_box]), "dtype": torch.Tensor([0])})
            # The two sent2sent edges below can be removed if you train the model yourself, since HSG does not use them.
            # However, if you want to use the released checkpoint directly, keep them;
            # otherwise the parameters may not match because of version differences.

            G.add_edges(sent_nid, sentid2nid, data={"dtype": torch.ones(N)})
            G.add_edges(sentid2nid, sent_nid, data={"dtype": torch.ones(N)})

        G.nodes[sentid2nid].data["words"] = torch.LongTensor(input_pad)  # [N, seq_len]
        G.nodes[sentid2nid].data["position"] = torch.arange(1, N + 1).view(-1, 1).long()  # [N, 1]
        G.nodes[sentid2nid].data["label"] = torch.LongTensor(label)  # [N, doc_max]
        snode = G.filter_nodes(lambda nodes: nodes.data["dtype"] == 1)

        # run LDA over the sentence nodes: topic_words are the word lists of the 3 topics,
        # doc_topics the per-sentence topic distribution, topic_input the padded word ids of each topic
        topic_words, doc_topics, topic_input = topics(G.nodes[snode].data["words"], self.vocab)

        tnode_id = G.filter_nodes(lambda nodes: nodes.data["dtype"] == 2)
        G.nodes[tnode_id].data["words"] = torch.LongTensor(topic_input)  # [3, 100]
        G.nodes[tnode_id].data["position"] = torch.arange(1, 4).view(-1, 1).long()  # [3, 1]

        # connect every sentence node to each of the 3 topic nodes;
        # the edge feature is the probability of that topic for the sentence
        for i in range(N):
            sent_nid = sentid2nid[i]
            feat = doc_topics[i]
            for k in range(3):
                G.add_edges(tnode_id[k], sent_nid, data={"dtype": torch.Tensor([2]), "feat": torch.Tensor([feat[k][1]])})
                G.add_edges(sent_nid, tnode_id[k], data={"dtype": torch.Tensor([2]), "feat": torch.Tensor([feat[k][1]])})

        return G

    def __getitem__(self, index):
        """
        :param index: int; the index of the example
        :return 
            G: graph for the example
            index: int; the index of the example in the dataset
        """
        item = self.get_example(index)
        input_pad = item.enc_sent_input_pad[:self.doc_max_timesteps]
        label = self.pad_label_m(item.label_matrix)
        w2s_w = self.w2s_tfidf[index]

        G = self.CreateGraph(input_pad, label, w2s_w)
        return G, index

    def __len__(self):
        return self.size
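
# A minimal sketch of how an ExampleSet might be instantiated. The file paths and the
# vocab argument are placeholders (assumptions), not the project's actual configuration;
# the preprocessing scripts referenced in the constructor docstring must have produced
# the filter-word and w2s tfidf files beforehand.
def _demo_exampleset_usage(vocab):
    dataset = ExampleSet(data_path="data/example/train.label.jsonl",          # hypothetical path
                         vocab=vocab,
                         doc_max_timesteps=50,
                         sent_max_len=100,
                         filter_word_path="cache/example/filter_word.txt",    # hypothetical path
                         w2s_path="cache/example/train.w2s.tfidf.jsonl")      # hypothetical path
    G, idx = dataset[0]  # one dgl graph plus the index of the example
    return dataset, G, idx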


class MultiExampleSet(ExampleSet):
    """ Constructor: Dataset of example(object) for multiple document summarization"""
    def __init__(self, data_path, vocab, doc_max_timesteps, sent_max_len, filter_word_path, w2s_path, w2d_path):
        """ Initializes the ExampleSet with the path of data

        :param data_path: string; the path of data
        :param vocab: object;
        :param doc_max_timesteps: int; the maximum sentence number of a document, each example should pad sentences to this length
        :param sent_max_len: int; the maximum token number of a sentence, each sentence should pad tokens to this length
        :param filter_word_path: str; file path, the file must contain one word for each line and the tfidf value must go from low to high (the format can refer to script/lowTFIDFWords.py) 
        :param w2s_path: str; file path, each line in the file contain a json format data (which can refer to the format can refer to script/calw2sTFIDF.py)
        :param w2d_path: str; file path, each line in the file contain a json format data (which can refer to the format can refer to script/calw2dTFIDF.py)
        """

        super().__init__(data_path, vocab, doc_max_timesteps, sent_max_len, filter_word_path, w2s_path)

        logger.info("[INFO] Loading word2doc TFIDF file from %s!" % w2d_path)
        self.w2d_tfidf = readJson(w2d_path)

    def get_example(self, index):
        e = self.example_list[index]
        e["summary"] = e.setdefault("summary", [])
        example = Example2(e["text"], e["summary"], self.vocab, self.sent_max_len, e["label"])
        return example

    def MapSent2Doc(self, article_len, sentNum):
        """ Map each global sentence index to the index of the document it belongs to, covering at most sentNum sentences. """
        sent2doc = {}
        doc2sent = {}
        sentNo = 0
        for i in range(len(article_len)):
            doc2sent[i] = []
            for j in range(article_len[i]):
                sent2doc[sentNo] = i
                doc2sent[i].append(sentNo)
                sentNo += 1
                if sentNo >= sentNum:
                    return sent2doc
        return sent2doc
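
    # Worked example: with article_len = [2, 3] and sentNum = 4 the returned mapping is
    # {0: 0, 1: 0, 2: 1, 3: 1}; the loop stops once sentNum sentences have been mapped,
    # so the last sentence of the second document is never assigned.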

    def CreateGraph(self, docLen, sent_pad, doc_pad, label, w2s_w, w2d_w):
        """ Create a graph for each document

        :param docLen: list; the length of each document in this example
        :param sent_pad: list(list), [sentnum, wordnum]
        :param doc_pad: list(list), [docnum, wordnum]
        :param label: list(list), [sentnum, sentnum]
        :param w2s_w: dict(dict) {str: {str: float}}, for each sentence and each word, the tfidf between them
        :param w2d_w: dict(dict) {str: {str: float}}, for each document and each word, the tfidf between them
        :return: G: dgl.DGLGraph
            node:
                word: unit=0, dtype=0, id=(int)wordid in vocab
                sentence: unit=1, dtype=1, words=tensor, position=int, label=tensor
                document: unit=1, dtype=2
            edge:
                word2sent, sent2word: tffrac=int, dtype=0
                word2doc, doc2word: tffrac=int, dtype=0
                sent2doc: dtype=2
        """
        # add word nodes
        G = dgl.DGLGraph()
        wid2nid, nid2wid = self.AddWordNode(G, sent_pad)
        w_nodes = len(nid2wid)

        # add sent nodes
        N = len(sent_pad)
        G.add_nodes(N)
        G.ndata["unit"][w_nodes:] = torch.ones(N)
        G.ndata["dtype"][w_nodes:] = torch.ones(N)
        sentid2nid = [i + w_nodes for i in range(N)]
        ws_nodes = w_nodes + N

        # add doc nodes
        sent2doc = self.MapSent2Doc(docLen, N)
        article_num = len(set(sent2doc.values()))
        G.add_nodes(article_num)
        G.ndata["unit"][ws_nodes:] = torch.ones(article_num)
        G.ndata["dtype"][ws_nodes:] = torch.ones(article_num) * 2
        docid2nid = [i + ws_nodes for i in range(article_num)]

        # add sent edges
        for i in range(N):
            c = Counter(sent_pad[i])
            sent_nid = sentid2nid[i]
            sent_tfw = w2s_w[str(i)]
            for wid, cnt in c.items():
                if wid in wid2nid.keys() and self.vocab.id2word(wid) in sent_tfw.keys():
                    tfidf = sent_tfw[self.vocab.id2word(wid)]
                    tfidf_box = np.round(tfidf * 9)  # box = 10
                    # w2s s2w
                    G.add_edge(wid2nid[wid], sent_nid,
                               data={"tffrac": torch.LongTensor([tfidf_box]), "dtype": torch.Tensor([0])})
                    G.add_edge(sent_nid, wid2nid[wid],
                               data={"tffrac": torch.LongTensor([tfidf_box]), "dtype": torch.Tensor([0])})
            # s2d
            docid = sent2doc[i]
            docnid = docid2nid[docid]
            G.add_edge(sent_nid, docnid, data={"tffrac": torch.LongTensor([0]), "dtype": torch.Tensor([2])})

        # add doc edges
        for i in range(article_num):
            c = Counter(doc_pad[i])
            doc_nid = docid2nid[i]
            doc_tfw = w2d_w[str(i)]
            for wid, cnt in c.items():
                if wid in wid2nid.keys() and self.vocab.id2word(wid) in doc_tfw.keys():
                    # w2d d2w
                    tfidf = doc_tfw[self.vocab.id2word(wid)]
                    tfidf_box = np.round(tfidf * 9)  # box = 10
                    G.add_edge(wid2nid[wid], doc_nid,
                               data={"tffrac": torch.LongTensor([tfidf_box]), "dtype": torch.Tensor([0])})
                    G.add_edge(doc_nid, wid2nid[wid],
                               data={"tffrac": torch.LongTensor([tfidf_box]), "dtype": torch.Tensor([0])})

        G.nodes[sentid2nid].data["words"] = torch.LongTensor(sent_pad)  # [N, seq_len]
        G.nodes[sentid2nid].data["position"] = torch.arange(1, N + 1).view(-1, 1).long()  # [N, 1]
        G.nodes[sentid2nid].data["label"] = torch.LongTensor(label)  # [N, doc_max]

        return G

    def __getitem__(self, index):
        """
        :param index: int; the index of the example
        :return 
            G: graph for the example
            index: int; the index of the example in the dataset
        """
        item = self.get_example(index)
        sent_pad = item.enc_sent_input_pad[:self.doc_max_timesteps]
        enc_doc_input = item.enc_doc_input
        article_len = item.article_len
        label = self.pad_label_m(item.label_matrix)

        G = self.CreateGraph(article_len, sent_pad, enc_doc_input, label, self.w2s_tfidf[index], self.w2d_tfidf[index])

        return G, index


class LoadHiExampleSet(torch.utils.data.Dataset):
    """ Dataset that loads pre-built graphs stored as <index>.graph.bin files under data_root. """
    def __init__(self, data_root):
        super().__init__()
        self.data_root = data_root
        self.gfiles = [f for f in os.listdir(self.data_root) if f.endswith("graph.bin")]
        logger.info("[INFO] Start loading %s", self.data_root)

    def __getitem__(self, index):
        graph_file = os.path.join(self.data_root, "%d.graph.bin" % index)
        g, label_dict = load_graphs(graph_file)
        return g[0], index

    def __len__(self):
        return len(self.gfiles)
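
# A minimal sketch of the file layout LoadHiExampleSet expects: one DGL binary file per
# example, named "<index>.graph.bin". Caching the graphs of an existing dataset like this
# is an assumption about how such files could be produced, not part of the original code.
def _demo_cache_graphs(dataset, cache_dir):
    os.makedirs(cache_dir, exist_ok=True)
    for i in range(len(dataset)):
        g, idx = dataset[i]
        save_graphs(os.path.join(cache_dir, "%d.graph.bin" % idx), [g])
    return LoadHiExampleSet(cache_dir)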


######################################### Tools #########################################


def catDoc(textlist):
    """ Concatenate a list of token-id lists into one flat list. """
    res = []
    for tlist in textlist:
        res.extend(tlist)
    return res
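
# e.g. catDoc([[2, 5], [7]]) -> [2, 5, 7]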


def readJson(fname):
    """ Read a JSON-lines file: one JSON object per line. """
    data = []
    with open(fname, encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data


def readText(fname):
    """ Read a text file into a list of stripped lines. """
    data = []
    with open(fname, encoding="utf-8") as f:
        for line in f:
            data.append(line.strip())
    return data


def graph_collate_fn(samples):
    '''
    :param samples: list of (G, index) pairs produced by the dataset
    :return: the batched dgl graph (examples sorted by number of sentence nodes, descending) and the corresponding list of example indices
    '''
    graphs, index = map(list, zip(*samples))
    graph_len = [len(g.filter_nodes(lambda nodes: nodes.data["dtype"] == 1)) for g in graphs]  # sent node of graph
    sorted_len, sorted_index = torch.sort(torch.LongTensor(graph_len), dim=0, descending=True)
    batched_graph = dgl.batch([graphs[idx] for idx in sorted_index])
    return batched_graph, [index[idx] for idx in sorted_index]
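
# A minimal sketch of wiring graph_collate_fn into a PyTorch DataLoader. The dataset and
# batch size are placeholders; the point is only the collate_fn argument.
def _demo_dataloader(dataset, batch_size=32):
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                         collate_fn=graph_collate_fn)
    for batched_graph, indices in loader:
        # batched_graph is a single dgl graph containing every example of the batch,
        # with the examples ordered by their number of sentence nodes (descending)
        break
    return loader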

def topics(input_pad, vocab):
    """ Extract 3 LDA topics from the sentences of one example.

    :param input_pad: [sentnum, seq_len]; padded word ids of the sentence nodes
    :param vocab: Vocabulary object
    :return: topicss: list(list(str)), the words of each topic;
             doc_top: per-sentence topic distribution from LdaModel.get_document_topics;
             enc_sent_input: list(list(int)), word ids of each topic, padded to length 100
    """
    # map ids back to words, dropping the special ids 0 and 1 (pad/unk), then remove stopwords and punctuation
    texts1 = [[vocab.id2word(int(w)) for w in yi if w != 0 and w != 1] for yi in input_pad]
    documents = [[w for w in line if w not in FILTERWORD] for line in texts1]

    # build a dictionary over the filtered tokens and a tfidf-weighted bag-of-words corpus
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=3)

    # parse the topic words out of the strings returned by print_topics(),
    # e.g. '0.143*"markets" + 0.143*"rallied"' -> ['markets', 'rallied']
    lsiout = lda.print_topics()
    topicss = []
    for i in range(len(lsiout)):
        dd = re.sub('[0-9 .*+”]', '', lsiout[i][1])
        b = dd.replace('"', ' ')
        nn = b[1:-1].split('  ')
        topicss.append(nn)

    doc_top = lda.get_document_topics(corpus, per_word_topics=False, minimum_probability=0.0)

    # convert the topic words to ids and pad each topic to length 100
    enc_sent_input = []
    for article_words in topicss:
        enc_sent_input.append([vocab.word2id(w.lower()) for w in article_words])
    for i in range(len(enc_sent_input)):
        pad_len = 100 - len(enc_sent_input[i])
        enc_sent_input[i].extend([0] * pad_len)

    return topicss, doc_top, enc_sent_input
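
# A small, self-contained sketch of calling topics(); the toy vocabulary below is an
# assumption for illustration only, and LdaModel training is stochastic, so the extracted
# topic words will differ between runs.
def _demo_topics():
    class _ToyVocab:
        def __init__(self, words):
            self._w2i = {"[PAD]": 0, "[UNK]": 1}
            for w in words:
                self._w2i.setdefault(w, len(self._w2i))
            self._i2w = {i: w for w, i in self._w2i.items()}

        def word2id(self, w):
            return self._w2i.get(w, self._w2i["[UNK]"])

        def id2word(self, i):
            return self._i2w[i]

    vocab = _ToyVocab("markets rallied sharply investors cheered earnings reports".split())
    # two "sentences" of word ids, already zero-padded to the same length
    input_pad = [[vocab.word2id(w) for w in "markets rallied sharply".split()] + [0, 0],
                 [vocab.word2id(w) for w in "investors cheered earnings reports".split()] + [0]]
    topic_words, doc_top, topic_input = topics(input_pad, vocab)
    # three topics, each padded to 100 word ids
    assert len(topic_input) == 3 and all(len(t) == 100 for t in topic_input)
    return topic_words, doc_top, topic_input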