卒論(戸田) の履歴(No.5) | 奥原研究室・Rene研究室ログ

戸田?

卒論

1028

1101

新規性

具体案

プログラム

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def extract_representative_texts(cluster_vectors, cluster_center, texts, n_center=5, n_edge=3):
    """Select texts that characterise a cluster: its core plus a few outliers.

    Members are ranked by Euclidean distance to ``cluster_center``. The
    ``n_center`` closest texts are taken first, then up to ``n_edge`` of the
    farthest texts are appended.

    Args:
        cluster_vectors: Vectors of the cluster members, passed unchanged to
            ``euclidean_distances``. Assumed to hold one row per entry of
            ``texts``, in the same order (TODO confirm against the caller).
        cluster_center: Center vector of the cluster.
        texts: Texts of the cluster members, indexable by integer position.
        n_center: Maximum number of texts to take from nearest the center.
        n_edge: Maximum number of texts to take from the far edge.

    Returns:
        list: The nearest texts (closest first) followed by the edge texts
        (in increasing distance). No member is selected twice, so the result
        is shorter than ``n_center + n_edge`` for small clusters.

    Raises:
        ValueError: If ``n_center`` or ``n_edge`` is negative.
    """
    if n_center < 0 or n_edge < 0:
        raise ValueError("n_center and n_edge must be non-negative")
    if len(texts) == 0:
        # Nothing to rank; avoid handing an empty array to scikit-learn.
        return []

    # Rank members by Euclidean distance to the cluster center (ascending).
    distances = euclidean_distances(cluster_vectors, [cluster_center]).flatten()
    sorted_indices = np.argsort(distances)

    center_indices = sorted_indices[:n_center]

    # Edge texts are drawn only from members not already chosen as center
    # texts, so small clusters do not yield duplicates. An explicit start
    # offset is used because a ``[-n_edge:]`` slice would select *everything*
    # when n_edge == 0.
    remaining = sorted_indices[n_center:]
    edge_start = max(len(remaining) - n_edge, 0)
    edge_indices = remaining[edge_start:]

    center_texts = [texts[i] for i in center_indices]
    edge_texts = [texts[i] for i in edge_indices]
    return center_texts + edge_texts

def extract_important_keywords(representative_texts, top_n=3):
    """Return the terms with the highest summed TF-IDF score across the texts.

    A ``TfidfVectorizer`` with default settings is fitted on
    ``representative_texts``. Each term's score is the sum of its TF-IDF
    weights over all texts.

    Args:
        representative_texts: Iterable of raw text strings.
        top_n: Maximum number of keywords to return.

    Returns:
        numpy.ndarray: Up to ``top_n`` terms, ordered from highest to lowest
        score. Empty when ``top_n`` is 0.

    Raises:
        ValueError: If ``top_n`` is negative. Errors raised by
            ``TfidfVectorizer.fit_transform`` (for example for input that
            yields an empty vocabulary) are propagated unchanged.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if top_n == 0:
        # A ``[-top_n:]`` slice would return the whole vocabulary here.
        return np.array([], dtype=object)

    # NOTE(review): the default TfidfVectorizer tokenizer keeps only tokens of
    # two or more word characters delimited by non-word characters. Text
    # without whitespace word boundaries (e.g. unsegmented Japanese)
    # presumably needs pre-tokenisation or a custom tokenizer; confirm
    # against the data the caller passes in.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(representative_texts)

    # Collapse the document axis: one aggregate score per vocabulary term.
    scores = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
    keywords = np.array(vectorizer.get_feature_names_out())

    # Sort descending so the most important term comes first; the stable sort
    # keeps ties in vocabulary order.
    top_indices = np.argsort(-scores, kind="stable")[:top_n]
    return keywords[top_indices]

# For every cluster, pick its representative documents and print the key terms
# extracted from them. Each cluster is read as a mapping with the keys
# 'vectors', 'center' and 'texts'.
for i, cluster in enumerate(clusters):
    representative_texts = extract_representative_texts(
        cluster_vectors=cluster['vectors'],
        cluster_center=cluster['center'],
        texts=cluster['texts'],
    )
    important_keywords = extract_important_keywords(
        representative_texts=representative_texts,
    )
    print(f"Cluster {i} Keywords:", important_keywords)