"""KNN text classifier over TF-IDF features.

Reads the spreadsheet of segmented text, splits it 80/20 into train and
test sets, vectorizes the text with TF-IDF, fits a k-nearest-neighbours
classifier and prints the test accuracy together with the elapsed time.
"""
import time

import jieba  # NOTE(review): not used in the visible script; kept as-is
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# perf_counter() is monotonic, so the measured duration cannot be skewed
# by system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()

# Load the data.
df = pd.read_excel('../Result/科室分词结果.xlsx', sheet_name='Sheet1')

# Empty spreadsheet cells are read as NaN, which TfidfVectorizer rejects as
# an invalid document; drop rows that lack either the label or the text.
df = df.dropna(subset=['title', '分词'])

# Extract the label column and the segmented-text column. The text is
# coerced to str so that a purely numeric cell does not break vectorization.
labels = df['title'].tolist()
texts = df['分词'].astype(str).tolist()

# Split into training and test sets (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Compute TF-IDF features. The vocabulary and IDF weights are learned from
# the training split only and then applied unchanged to the test split.
# NOTE(review): the default token_pattern keeps only tokens of two or more
# word characters, so single-character tokens are ignored; confirm that
# this is intended for this corpus.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the KNN classifier.
knn = KNeighborsClassifier(n_neighbors=46)
knn.fit(X_train_tfidf, y_train)

# Predict on the test set.
y_pred = knn.predict(X_test_tfidf)

# Compute the accuracy.
accuracy = accuracy_score(y_test, y_pred)

end_time = time.perf_counter()

print('Accuracy:', accuracy)
print("代码运行时间为:", end_time - start_time, "秒")