"""Department recommendation.

Segments free-text patient questions with jieba and classifies them with a
TF-IDF + k-nearest-neighbours model.
"""

import pandas as pd
import jieba.analyse
from hanziconv import HanziConv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

# Stopword list shared by the training and test preprocessing steps.
# Candidate lists noted by the author: generic Chinese stopwords, HIT,
# Sichuan University Machine Intelligence Lab, Baidu.
# TODO: add medical-domain stopwords to improve model accuracy.
STOPWORDS_PATH = '../datas/hit_stopwords.txt'


def _load_stopwords(path=STOPWORDS_PATH):
    """Return the stopwords stored one per line in *path* as a set.

    The file is opened in a ``with`` block so the handle is always closed,
    and a set is returned so the per-token membership test is O(1).
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return {line.strip() for line in handle}


def _segment(texts, stopwords):
    """Tokenise each string in *texts* and drop stopwords.

    Every string is converted to Simplified Chinese, cut with jieba, filtered
    against *stopwords* and re-joined with single spaces. Entries that are not
    ``str`` instances are skipped, so the result may be shorter than *texts*.
    """
    return [
        ' '.join(
            token
            for token in jieba.cut(HanziConv.toSimplified(text))
            if token not in stopwords
        )
        for text in texts
        if isinstance(text, str)
    ]


def fenci():
    """Segment the training questions and write them to a new workbook.

    Reads column ``titleask`` from sheet ``外科`` of ``外科.xlsx`` and writes the
    segmented text as column ``ask`` to sheet ``Sheet2`` of ``外科5-14000.xlsx``.
    """
    # na_filter=False keeps empty cells as '' instead of NaN, so they still
    # pass the isinstance(str) check in _segment and keep their row.
    df = pd.read_excel('外科.xlsx', sheet_name='外科', na_filter=False)
    segmented = _segment(df['titleask'].tolist(), _load_stopwords())
    df_result = pd.DataFrame(segmented, columns=['ask'])
    df_result.to_excel('外科5-14000.xlsx', sheet_name='Sheet2', index=False)


def fencitest():
    """Segment the test questions, overwriting the test workbook in place.

    Reads column ``test`` from sheet ``Sheet1`` of ``测试数据.xlsx`` and writes
    the segmented text back to the same file, sheet and column. Because the
    input file is overwritten, running this twice segments already-segmented
    text.
    """
    df = pd.read_excel('测试数据.xlsx', sheet_name='Sheet1', engine='openpyxl')
    segmented = _segment(df['test'].tolist(), _load_stopwords())
    df_result = pd.DataFrame(segmented, columns=['test'])
    df_result.to_excel('测试数据.xlsx', sheet_name='Sheet1', index=False)


def keshi(k=4):
    """Train a KNN classifier on the segmented questions and print predictions.

    Training text and labels come from columns ``ask`` and ``label`` of
    ``外科5-14000.xlsx`` (sheet ``Sheet2``); the text to classify comes from
    column ``test`` of ``测试数据.xlsx`` (sheet ``Sheet1``).

    NOTE(review): fenci() writes only the ``ask`` column, so the ``label``
    column is presumably added to the workbook separately -- confirm.

    Args:
        k: Number of neighbours used by the classifier (defaults to 4).
    """
    # Build the training and test data.
    df_train = pd.read_excel('外科5-14000.xlsx', sheet_name='Sheet2')
    df_test = pd.read_excel('测试数据.xlsx', sheet_name='Sheet1')

    # A question that was reduced to '' by stopword removal is read back from
    # Excel as NaN, and an all-digit cell as a number; TfidfVectorizer rejects
    # both, so coerce every document to a string first.
    train_data = df_train['ask'].fillna('').astype(str).values
    train_labels = df_train['label'].values
    test_data = df_test['test'].fillna('').astype(str).values

    # Vectorise the text with TF-IDF; the vocabulary is fitted on the
    # training data only.
    vectorizer = TfidfVectorizer()
    train_vectors = vectorizer.fit_transform(train_data)
    test_vectors = vectorizer.transform(test_data)

    # Fit the KNN classifier on the training set.
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(train_vectors, train_labels)

    # Predict a label for every test document.
    test_predictions = knn_classifier.predict(test_vectors)
    print("预测标签:", test_predictions)


if __name__ == '__main__':
    # The preprocessing steps are disabled by default; enable them to
    # regenerate the segmented workbooks before classifying.
    # fenci()
    # fencitest()
    keshi()