Commit e03f09ee authored by lichusong

0716-word2Vec

parent fe23f998
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
import pandas as pd
# Load the pre-trained Word2Vec model
model = Word2Vec.load('../demo/test.model')
# Load the Excel file
df = pd.read_excel('消化内科对话_已解决问题0510.xlsx', sheet_name='w')
# Extract the title and doctor name columns data
titles = df['标题'].tolist()
doctors = df['医生姓名'].tolist()
# Input sentences
# sentences = ["请 彩超 报告 分析", "非 萎缩 胃窦炎 伴 糜烂", "脾胃 湿热 消化不良 有时候 脚 冰凉 小便 次"]
# Read the Excel file
df = pd.read_excel('导医测试数据.xlsx', sheet_name='Sheet1')
sentences = df['ask'].tolist()
# Initialize the list of lists to store doctors
doctors_lists = [[] for _ in range(len(sentences))]
for i, sentence in enumerate(sentences):
    # Preprocess the sentence
    preprocessed_sentence = simple_preprocess(sentence)
    # Calculate similarity between the sentence and titles
    similarities = []
    for title, doctor in zip(titles, doctors):
        # Convert title and doctor to string
        title = str(title)
        doctor = str(doctor)
        preprocessed_title = simple_preprocess(title)
        # Check if either list is empty
        if not preprocessed_sentence or not preprocessed_title:
            continue
        similarity = model.wv.n_similarity(preprocessed_sentence, preprocessed_title)
        # Exclude similarity of 1
        if similarity == 1.0:
            continue
        similarities.append((similarity, title, doctor))
    # Sort similarities in descending order
    similarities.sort(reverse=True)
    # Get the top ten similarities while excluding duplicate titles
    top_similarities = []
    seen_titles = set()
    for similarity, title, doctor in similarities:
        if title not in seen_titles:
            top_similarities.append((similarity, title, doctor))
            seen_titles.add(title)
            if len(top_similarities) == 10:
                break
    # Store the doctors in the respective list
    doctors_list = [doctor for _, _, doctor in top_similarities]
    doctors_lists[i] = doctors_list
# Print the list of lists of doctors
# print("医生名称数组:", doctors_lists)
test_df = pd.read_excel('导医测试数据.xlsx', sheet_name='Sheet1')
# Extract the "doctor" column data from the test dataset
test_doctors = test_df['doctor'].tolist()
# Calculate accuracy
correct_predictions = 0
total_predictions = len(doctors_lists)
for i, doctors in enumerate(doctors_lists):
    if i < len(test_doctors) and len(doctors) > 0 and test_doctors[i] in doctors:
        correct_predictions += 1
# Accuracy
accuracy = correct_predictions / total_predictions
print("准确率:", accuracy)
# Test set
excel_file = '导医测试数据.xlsx'
# sheet_name = 'Sheet1'
sheet_name = 'Sheet1'
df = pd.read_excel(excel_file, sheet_name=sheet_name)
test_data = df['doctor'].tolist()
test_data1 = [[doctor] for doctor in test_data]
# Mean precision over the test set; doctors_lists is the set of recommended doctors
# Average precision
def calculate_average_precision(candidate_set, test_set):
    total_precision = 0.0
    relevant_count = 0
    for i in range(len(test_set)):
        query = test_set[i]
        candidates = candidate_set[i]
        position = 0
        for j in range(len(candidates)):
            position += 1
            if query in candidates[j]:
                precision = 1.0 / position
                total_precision += precision
                relevant_count += 1
                break
    if relevant_count > 0:
        return total_precision / len(candidate_set)
    else:
        return 0.0
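# Worked example (assumed data, not from the dataset): with candidate lists
# [['A', 'B'], ['C']] and test doctors ['B', 'C'], the hits land at positions 2 and 1,
# so the function returns (1/2 + 1/1) / 2 = 0.75 - i.e. a mean reciprocal rank over
# all queries rather than average precision in the strict IR sense.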
average_precision = calculate_average_precision(doctors_lists, test_data)
print(f"平均精准度: {average_precision:.2%}")
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import pandas as pd
from jieba import posseg
# Read the Excel file
data = pd.read_excel('导医测试数据.xlsx', sheet_name='Sheet1')
# Get the title column data
titles = data['ask'].tolist()
with open('stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()
segmented_titles = []
for title in titles:
    words = posseg.cut(title)
    filtered_words = [word for word, flag in words if word not in stopwords]
    segmented_title = ' '.join(filtered_words)
    segmented_titles.append(segmented_title)
# Save the segmented data to a file
with open('data1.txt', 'w', encoding='utf-8') as file:
    for title in segmented_titles:
        file.write(title + '\n')
# model = Word2Vec(LineSentence(open('data.txt', 'r', encoding='utf8')), sg=0, vector_size=20, window=5, min_count=1, workers=4)
#
# # Save the word vectors
# model.wv.save_word2vec_format('data.vector', binary=False)
#
# # Save the model
# model.save('test.model')
\ No newline at end of file
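# The commented-out training call above reads 'data.txt', while this script writes its
# segmented output to 'data1.txt'. A minimal sketch of the training step run against the
# file actually produced here; hyperparameters simply mirror the commented-out line, and
# the output names 'data1.vector' and 'test.model' are assumptions, not values confirmed
# by this commit.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

model = Word2Vec(LineSentence('data1.txt'), sg=0, vector_size=20, window=5,
                 min_count=1, workers=4)
model.wv.save_word2vec_format('data1.vector', binary=False)  # plain-text word vectors
model.save('test.model')  # same file name as the model loaded by the scoring script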
Word2vec
FastText
GloVe
ELMo
\ No newline at end of file
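# The list above names alternatives to Word2vec. As a rough sketch (not part of this
# commit), gensim also ships FastText, which trains with almost the same call as the
# commented-out Word2Vec code and can additionally build vectors for unseen words from
# character n-grams. Hyperparameters below simply mirror that code, and the output path
# 'fasttext_test.model' is a hypothetical name.
from gensim.models import FastText
from gensim.models.word2vec import LineSentence

ft_model = FastText(LineSentence('data1.txt'), sg=0, vector_size=20, window=5,
                    min_count=1, workers=4)
ft_model.save('fasttext_test.model')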