Commit e03f09ee authored by lichusong

0716-word2Vec

parent fe23f998
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
import pandas as pd
# Load the pre-trained Word2Vec model
model = Word2Vec.load('../demo/test.model')
# Load the Excel file
df = pd.read_excel('消化内科对话_已解决问题0510.xlsx', sheet_name='w')
# Extract the title and doctor name columns data
titles = df['标题'].tolist()
doctors = df['医生姓名'].tolist()
# Input sentences
# sentences = ["请 彩超 报告 分析", "非 萎缩 胃窦炎 伴 糜烂", "脾胃 湿热 消化不良 有时候 脚 冰凉 小便 次"]
# Read the Excel file
df = pd.read_excel('导医测试数据.xlsx', sheet_name='Sheet1')
sentences = df['ask'].tolist()
# Initialize the list of lists to store doctors
doctors_lists = [[] for _ in range(len(sentences))]
for i, sentence in enumerate(sentences):
    # Preprocess the sentence
    preprocessed_sentence = simple_preprocess(sentence)
    # Calculate similarity between the sentence and titles
    similarities = []
    for title, doctor in zip(titles, doctors):
        # Convert title and doctor to string
        title = str(title)
        doctor = str(doctor)
        preprocessed_title = simple_preprocess(title)
        # Check if either list is empty
        if not preprocessed_sentence or not preprocessed_title:
            continue
        similarity = model.wv.n_similarity(preprocessed_sentence, preprocessed_title)
        # Exclude similarity of 1
        if similarity == 1.0:
            continue
        similarities.append((similarity, title, doctor))
    # Sort similarities in descending order
    similarities.sort(reverse=True)
    # Get the top ten similarities while excluding duplicate titles
    top_similarities = []
    seen_titles = set()
    for similarity, title, doctor in similarities:
        if title not in seen_titles:
            top_similarities.append((similarity, title, doctor))
            seen_titles.add(title)
            if len(top_similarities) == 10:
                break
    # Store the doctors in the respective list
    doctors_list = [doctor for _, _, doctor in top_similarities]
    doctors_lists[i] = doctors_list
# Print the list of lists of doctors
# print("医生名称数组:", doctors_lists)
test_df = pd.read_excel('导医测试数据.xlsx', sheet_name='Sheet1')
# Extract the "doctor" column data from the test dataset
test_doctors = test_df['doctor'].tolist()
# Calculate accuracy
correct_predictions = 0
total_predictions = len(doctors_lists)
for i, doctors in enumerate(doctors_lists):
    if i < len(test_doctors) and len(doctors) > 0 and test_doctors[i] in doctors:
        correct_predictions += 1
# Accuracy
accuracy = correct_predictions / total_predictions
print("准确率:", accuracy)
# Test set
excel_file = '导医测试数据.xlsx'
# sheet_name = 'Sheet1'
sheet_name = 'Sheet1'
df = pd.read_excel(excel_file, sheet_name=sheet_name)
test_data = df['doctor'].tolist()
test_data1 = [[doctor] for doctor in test_data]
# Mean precision over the test set; doctors_lists is the set of recommended doctors
# Average precision
def calculate_average_precision(candidate_set, test_set):
    total_precision = 0.0
    relevant_count = 0
    for i in range(len(test_set)):
        query = test_set[i]
        candidates = candidate_set[i]
        position = 0
        for j in range(len(candidates)):
            position += 1
            if query in candidates[j]:
                precision = 1.0 / position
                total_precision += precision
                relevant_count += 1
                break
    if relevant_count > 0:
        return total_precision / len(candidate_set)
    else:
        return 0.0
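# Worked example (assumed data, not from the dataset): with candidate lists
# [['A', 'B'], ['C']] and test doctors ['B', 'C'], the hits land at positions 2 and 1,
# so the function returns (1/2 + 1/1) / 2 = 0.75 - i.e. a mean reciprocal rank over
# all queries rather than average precision in the strict IR sense.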
average_precision = calculate_average_precision(doctors_lists, test_data)
print(f"平均精准度: {average_precision:.2%}")
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import pandas as pd
from jieba import posseg
# Read the Excel file
data = pd.read_excel('导医测试数据.xlsx', sheet_name='Sheet1')
# Get the title column data
titles = data['ask'].tolist()
with open('stopwords.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()
segmented_titles = []
for title in titles:
    words = posseg.cut(title)
    filtered_words = [word for word, flag in words if word not in stopwords]
    segmented_title = ' '.join(filtered_words)
    segmented_titles.append(segmented_title)
# Save the segmented data to a file
with open('data1.txt', 'w', encoding='utf-8') as file:
    for title in segmented_titles:
        file.write(title + '\n')
# model = Word2Vec(LineSentence(open('data.txt', 'r', encoding='utf8')), sg=0, vector_size=20, window=5, min_count=1, workers=4)
#
# # Save the word vectors
# model.wv.save_word2vec_format('data.vector', binary=False)
#
# # Save the model
# model.save('test.model')
\ No newline at end of file
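# The commented-out training call above reads 'data.txt', while this script writes its
# segmented output to 'data1.txt'. A minimal sketch of the training step run against the
# file actually produced here; hyperparameters simply mirror the commented-out line, and
# the output names 'data1.vector' and 'test.model' are assumptions, not values confirmed
# by this commit.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

model = Word2Vec(LineSentence('data1.txt'), sg=0, vector_size=20, window=5,
                 min_count=1, workers=4)
model.wv.save_word2vec_format('data1.vector', binary=False)  # plain-text word vectors
model.save('test.model')  # same file name as the model loaded by the scoring script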
Word2vec
FastText
GloVe
ELMo
\ No newline at end of file
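# The list above names alternatives to Word2vec. As a rough sketch (not part of this
# commit), gensim also ships FastText, which trains with almost the same call as the
# commented-out Word2Vec code and can additionally build vectors for unseen words from
# character n-grams. Hyperparameters below simply mirror that code, and the output path
# 'fasttext_test.model' is a hypothetical name.
from gensim.models import FastText
from gensim.models.word2vec import LineSentence

ft_model = FastText(LineSentence('data1.txt'), sg=0, vector_size=20, window=5,
                    min_count=1, workers=4)
ft_model.save('fasttext_test.model')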