當前位置：首頁 > 编程资源 > 编程问答 > 内容正文

编程问答

自动摘要

發布時間：2024/7/5 编程问答 30 豆豆

生活随笔收集整理的這篇文章主要介紹了自动摘要，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

#!/usr/bin/python
# coding:utf-8

import nltk
import numpy
import jieba
import codecs
import os

class SummaryTxt:
    """Extractive text summarizer based on clusters of frequent words.

    The text is split into sentences, the most frequent non-stop-words are
    collected as keywords, and every sentence is scored by how densely those
    keywords are packed together. Two selection strategies are offered:
    ``summaryScoredtxt`` (score above mean + 0.5 * std) and
    ``summaryTopNtxt`` (the highest scoring ``TOP_SENTENCES`` sentences).
    """

    # Characters treated as sentence terminators (ASCII and full-width forms).
    _SENTENCE_ENDINGS = '.!?。！？'

    def __init__(self, stopwordspath, n=100, cluster_threshold=5, top_sentences=5):
        """Configure the summarizer and load the stop-word list.

        :param stopwordspath: path of a UTF-8 text file with one stop word per
            line; if the path does not exist no stop words are used.
        :param n: number of most frequent words used as keywords.
        :param cluster_threshold: two keyword positions belong to the same
            cluster when their distance is smaller than this value.
        :param top_sentences: number of sentences kept by ``summaryTopNtxt``.
        """
        # Number of keywords taken from the frequency ranking.
        self.N = n
        # Maximum token distance between keywords of one cluster.
        self.CLUSTER_THRESHOLD = cluster_threshold
        # Number of sentences returned by summaryTopNtxt.
        self.TOP_SENTENCES = top_sentences
        # The (misspelled) attribute name is kept for backward compatibility.
        self.stopwrods = {}
        if os.path.exists(stopwordspath):
            # The context manager closes the file handle deterministically.
            with codecs.open(stopwordspath, 'r', encoding='utf8') as handle:
                stoplist = [line.strip() for line in handle.readlines()]
            self.stopwrods = dict.fromkeys(stoplist)

    def _split_sentences(self, texts):
        """Split ``texts`` into sentences.

        A sentence ends at (and includes) any character contained in
        ``_SENTENCE_ENDINGS``. Trailing text without a terminator becomes the
        last sentence. Consecutive terminators each close their own sentence.

        :param texts: the text to split
        :return: list of sentence strings, in document order
        """
        sentences = []
        start = 0  # index of the first character of the current sentence
        for index, char in enumerate(texts):
            if char in self._SENTENCE_ENDINGS:
                sentences.append(texts[start:index + 1])
                start = index + 1
        if start < len(texts):
            # The text did not end with a terminator; keep the remainder.
            sentences.append(texts[start:])
        return sentences

    def _top_words(self, sentences):
        """Return up to ``self.N`` most frequent keywords of ``sentences``.

        Tokens that are stop words or shorter than two characters are
        ignored (which also drops single whitespace tokens such as a tab).
        """
        words = []
        for sentence in sentences:
            for w in jieba.cut(sentence):
                if len(w) > 1 and w not in self.stopwrods:
                    words.append(w)
        wordfre = nltk.FreqDist(words)
        ranked = sorted(wordfre.items(), key=lambda item: item[1], reverse=True)
        return [word for word, _count in ranked[:self.N]]

    def _score_sentences(self, sentences, topn_words):
        """Score every sentence by its densest cluster of keywords.

        :param sentences: list of sentence strings
        :param topn_words: list of keywords
        :return: list of ``(sentence_index, score)`` tuples; sentences that
            contain no keyword are omitted
        """
        scores = []
        for sentence_idx, sentence in enumerate(sentences):
            # Map each token to the position of its first occurrence; this
            # yields the same positions as list.index() in a single pass.
            first_position = {}
            for position, token in enumerate(jieba.cut(sentence)):
                first_position.setdefault(token, position)

            word_idx = sorted(
                first_position[w] for w in topn_words if w in first_position
            )
            if not word_idx:
                continue

            # Group neighbouring keyword positions into clusters: a gap of
            # CLUSTER_THRESHOLD or more starts a new cluster.
            clusters = []
            cluster = [word_idx[0]]
            for position in word_idx[1:]:
                if position - cluster[-1] < self.CLUSTER_THRESHOLD:
                    cluster.append(position)
                else:
                    clusters.append(cluster)
                    cluster = [position]
            clusters.append(cluster)

            # Cluster score = (keywords in cluster)^2 / (token span of the
            # cluster); the sentence score is the best cluster score.
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = (1.0 * significant_words_in_cluster
                         * significant_words_in_cluster / total_words_in_cluster)
                if score > max_cluster_score:
                    max_cluster_score = score
            scores.append((sentence_idx, max_cluster_score))
        return scores

    def summaryScoredtxt(self, text):
        """Summarize ``text`` with a mean/standard-deviation filter.

        Sentences whose score is greater than ``mean + 0.5 * std`` of all
        scored sentences are selected. Each selected sentence is printed.

        :param text: the text to summarize
        :return: list of selected sentences in document order (empty when no
            sentence contains a keyword)
        """
        sentences = self._split_sentences(text)
        topn_words = self._top_words(sentences)
        scored_sentences = self._score_sentences(sentences, topn_words)
        if not scored_sentences:
            # Avoid numpy's NaN result and RuntimeWarning on empty input.
            return []

        values = [score for _idx, score in scored_sentences]
        threshold = numpy.mean(values) + 0.5 * numpy.std(values)

        summarySentences = []
        for sent_idx, score in scored_sentences:
            if score > threshold:
                summarySentences.append(sentences[sent_idx])
                print(sentences[sent_idx])
        return summarySentences

    def summaryTopNtxt(self, text):
        """Summarize ``text`` with its highest scoring sentences.

        The ``TOP_SENTENCES`` best scored sentences are selected and
        returned in document order. Each selected sentence is printed.

        :param text: the text to summarize
        :return: list of selected sentences in document order
        """
        sentences = self._split_sentences(text)
        topn_words = self._top_words(sentences)
        scored_sentences = self._score_sentences(sentences, topn_words)

        if self.TOP_SENTENCES > 0:
            by_score = sorted(scored_sentences, key=lambda s: s[1])
            top_n_scored = by_score[-self.TOP_SENTENCES:]
        else:
            # A slice of [-0:] would select every sentence.
            top_n_scored = []
        # Restore document order for the selected sentences.
        top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

        summarySentences = []
        for idx, _score in top_n_scored:
            print(sentences[idx])
            summarySentences.append(sentences[idx])

        # Return the selected sentences, not the complete sentence list.
        return summarySentences

if __name__ == '__main__':
    # Demo: summarize a local text file with both selection strategies.
    # Raw strings keep the Windows backslashes literal; the previous
    # non-raw literals relied on the invalid escapes "\c" and "\d".
    obj = SummaryTxt(r'E:\comments\cn_stopwords.txt')
    # NOTE(review): the file is read with the platform default encoding,
    # while the stop-word list is read as UTF-8 -- confirm the data file's
    # encoding and pass encoding=... explicitly if needed.
    with open(r'E:\comments\data.txt', "r") as f:
        txt = f.read()
    print(txt)
    print("--")
    # Both summary methods print the sentences they select.
    obj.summaryScoredtxt(txt)

    print("----")
    obj.summaryTopNtxt(txt)

總結

以上是生活随笔為你收集整理的自动摘要的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

摘要

上一篇：判断平台是windows还是linux，
下一篇：朵唯机器人怎么连网_平遥古城推出机器人导