1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
import jieba.analyse
from hanziconv import HanziConv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
'''
科室推荐
'''
def fenci():
    """Segment the raw training questions and save them as token strings.

    Reads column ``titleask`` from sheet ``外科`` of ``外科.xlsx``, converts
    every string cell to Simplified Chinese, tokenizes it with jieba, drops
    the tokens listed in ``../datas/hit_stopwords.txt`` and writes the
    remaining tokens, joined by single spaces, to column ``ask`` of sheet
    ``Sheet2`` in ``外科5-14000.xlsx``.

    Cells that are not strings are skipped, so the output can contain fewer
    rows than the input. All paths are relative to the working directory.
    """
    # na_filter=False turns off pandas' NA detection for this sheet.
    df = pd.read_excel('外科.xlsx', sheet_name='外科', na_filter=False)

    # Candidate stopword lists: generic Chinese, HIT, Sichuan University
    # Machine Intelligence Lab, Baidu. Only the HIT list is loaded here.
    # TODO: add medical-domain stopwords to improve model accuracy.
    # The context manager closes the file; a set makes each membership
    # test O(1) instead of a scan over the whole stopword list.
    with open('../datas/hit_stopwords.txt', 'r', encoding='utf-8') as stopword_file:
        stopwords = {entry.strip() for entry in stopword_file}

    result = []
    for line in df['titleask'].tolist():
        if not isinstance(line, str):
            continue
        tokens = jieba.cut(HanziConv.toSimplified(line))
        result.append(' '.join(tok for tok in tokens if tok not in stopwords))

    # NOTE(review): keshi() reads a 'label' column from this workbook, but
    # only 'ask' is written here -- presumably labels are added separately;
    # confirm before regenerating the file.
    df_result = pd.DataFrame(result, columns=['ask'])
    df_result.to_excel('外科5-14000.xlsx', sheet_name='Sheet2', index=False)
def fencitest():
    """Segment the test questions in place.

    Reads column ``test`` from sheet ``Sheet1`` of ``测试数据.xlsx``,
    converts every string cell to Simplified Chinese, tokenizes it with
    jieba, drops the tokens listed in ``../datas/hit_stopwords.txt`` and
    writes the remaining tokens, joined by single spaces, back to column
    ``test`` of sheet ``Sheet1`` in the same workbook.

    The input workbook is overwritten. Cells that are not strings are
    skipped, so the rewritten sheet can contain fewer rows than before.
    All paths are relative to the working directory.
    """
    df = pd.read_excel('测试数据.xlsx', sheet_name='Sheet1', engine='openpyxl')

    # The context manager closes the file; a set makes each membership
    # test O(1) instead of a scan over the whole stopword list.
    with open('../datas/hit_stopwords.txt', 'r', encoding='utf-8') as stopword_file:
        stopwords = {entry.strip() for entry in stopword_file}

    result = []
    for line in df['test'].tolist():
        if not isinstance(line, str):
            continue
        tokens = jieba.cut(HanziConv.toSimplified(line))
        result.append(' '.join(tok for tok in tokens if tok not in stopwords))

    df_result = pd.DataFrame(result, columns=['test'])
    df_result.to_excel('测试数据.xlsx', sheet_name='Sheet1', index=False)
def keshi():
    """Predict a label for every test question using TF-IDF and KNN.

    Training text and labels come from columns ``ask`` and ``label`` of
    sheet ``Sheet2`` in ``外科5-14000.xlsx``; the questions to classify come
    from column ``test`` of sheet ``Sheet1`` in ``测试数据.xlsx``. Both text
    columns are expected to hold space-separated tokens.

    Returns:
        The predicted labels, one per row of the test sheet. They are also
        printed to stdout.
    """
    # Build the training and test data.
    df = pd.read_excel('外科5-14000.xlsx', sheet_name='Sheet2')
    # Empty cells are read back as NaN and numeric-looking cells as numbers;
    # TfidfVectorizer only accepts strings, so normalise the text columns.
    train_data = df['ask'].fillna('').astype(str).values
    train_labels = df['label'].values
    df1 = pd.read_excel('测试数据.xlsx', sheet_name='Sheet1')
    test_data = df1['test'].fillna('').astype(str).values

    # Vectorise the text with TF-IDF; the vocabulary is learned from the
    # training set only and then reused for the test set.
    # NOTE(review): the default token pattern ignores one-character tokens,
    # which may discard many Chinese words -- confirm this is intended.
    vectorizer = TfidfVectorizer()
    train_vectors = vectorizer.fit_transform(train_data)
    test_vectors = vectorizer.transform(test_data)

    # Fit the KNN classifier on the training vectors.
    k = 4  # number of neighbours consulted for each prediction
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(train_vectors, train_labels)

    # Classify the test set.
    test_predictions = knn_classifier.predict(test_vectors)
    print("预测标签:", test_predictions)
    return test_predictions
if __name__ == "__main__":
    # Script entry point: only the classification step runs by default.
    # Enable the two calls below to regenerate the segmented workbooks first.
    # fenci()
    # fencitest()
    keshi()