from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
import pandas as pd

# Word2Vec model loaded from disk; its word vectors are what the similarity
# scoring further down relies on.
model = Word2Vec.load('../demo/test.model')

# Reference sheet: the title column and the doctor-name column are read as two
# parallel lists (row i of one belongs with row i of the other).
df = pd.read_excel('消化内科对话_已解决问题0510.xlsx', sheet_name='w')
titles, doctors = df['标题'].tolist(), df['医生姓名'].tolist()

# Test sheet: the 'ask' column holds the questions to find doctors for.
# NOTE(review): a sample previously kept here suggests the questions are
# pre-segmented, space-separated words (e.g. "请 彩超 报告 分析") - confirm.
# `df` is rebound on purpose; the reference frame is not needed after this.
df = pd.read_excel('导医测试数据.xlsx', sheet_name='Sheet1')
sentences = df['ask'].tolist()

# Maximum number of doctors recommended per question.
TOP_N = 10

# Scores within this distance of 1 are treated as "the same text as the
# question" and excluded. The score is a floating-point cosine similarity, so
# an exact `== 1.0` test can miss values such as 0.99999994 or 1.0000001.
DUPLICATE_TOLERANCE = 1e-6


def _vocab_tokens(text):
    """Tokenize *text* and keep only tokens the Word2Vec model has a vector for.

    Unknown tokens are dropped up front because, depending on the installed
    gensim release, ``n_similarity`` either ignores them or raises ``KeyError``.

    NOTE(review): ``simple_preprocess`` discards tokens shorter than two
    characters by default, which removes single-character words - confirm
    that this is intended for this data.
    """
    return [token for token in simple_preprocess(text) if token in model.wv]


# Tokenize every title once, instead of once per question. Titles that yield
# no usable token can never be scored, so they are left out here.
corpus = []
for raw_title, raw_doctor in zip(titles, doctors):
    title_text = str(raw_title)
    title_tokens = _vocab_tokens(title_text)
    if title_tokens:
        corpus.append((title_tokens, title_text, str(raw_doctor)))

# doctors_lists[i] holds the recommended doctors for sentences[i], best first.
doctors_lists = []
for sentence in sentences:
    # Non-string entries (e.g. NaN for an empty spreadsheet cell) cannot be
    # tokenized; such a question simply gets no recommendations.
    sentence_tokens = _vocab_tokens(sentence) if isinstance(sentence, str) else []

    scored = []
    if sentence_tokens:
        for title_tokens, title_text, doctor_name in corpus:
            similarity = model.wv.n_similarity(sentence_tokens, title_tokens)

            # Exclude titles that are effectively identical to the question.
            if abs(similarity - 1.0) <= DUPLICATE_TOLERANCE:
                continue

            scored.append((similarity, title_text, doctor_name))

    # Highest similarity first; ties fall back to title, then doctor name.
    scored.sort(reverse=True)

    # Keep the best TOP_N matches, counting each distinct title only once.
    # The same doctor may still appear more than once via different titles.
    recommended = []
    seen_titles = set()
    for _, title_text, doctor_name in scored:
        if title_text in seen_titles:
            continue
        seen_titles.add(title_text)
        recommended.append(doctor_name)
        if len(recommended) == TOP_N:
            break

    doctors_lists.append(recommended)

# Hit rate of the recommendations: a question counts as correct when its
# expected doctor appears anywhere in the list recommended for it.
test_df = pd.read_excel('导医测试数据.xlsx', sheet_name='Sheet1')

# Expected doctor per test row; row i is compared with doctors_lists[i].
test_doctors = test_df['doctor'].tolist()

total_predictions = len(doctors_lists)

# zip() stops at the shorter sequence, so a recommendation list without a
# matching expected doctor is never counted as correct. The loop names are
# chosen so the module-level `doctors` list is not overwritten.
correct_predictions = sum(
    1
    for recommended, expected in zip(doctors_lists, test_doctors)
    if expected in recommended
)

# Accuracy; report 0.0 instead of raising ZeroDivisionError when there are no
# questions at all.
accuracy = correct_predictions / total_predictions if total_predictions else 0.0
print("准确率:", accuracy)

# Test set: reload the expected doctor of every test row.
excel_file, sheet_name = '导医测试数据.xlsx', 'Sheet1'
df = pd.read_excel(excel_file, sheet_name=sheet_name)
test_data = df['doctor'].tolist()
# Same values as test_data, each wrapped in its own single-element list.
test_data1 = [[expected] for expected in test_data]


# Ranking metric for the recommendations (doctors_lists is the set of
# recommended doctors, one ranked list per question).
def calculate_average_precision(candidate_set, test_set):
    """Return the mean reciprocal rank of the expected item in each ranking.

    For every query ``test_set[i]`` the ranked list ``candidate_set[i]`` is
    scanned from the top. The first hit at 1-based position ``p`` contributes
    ``1 / p``; a query without a hit contributes nothing. The sum is divided
    by ``len(candidate_set)``.

    A string candidate is a hit only when it equals the query exactly, so a
    name that merely contains the query (or shares a surname with it) does
    not count. A non-string candidate is treated as a container and is a hit
    when it contains the query.

    Args:
        candidate_set: Sequence of ranked candidate lists, one per query.
        test_set: Sequence of expected values, aligned with ``candidate_set``.
            Non-string values (e.g. NaN for a missing cell) never match a
            string candidate.

    Returns:
        The mean reciprocal rank as a float, or 0.0 when nothing was hit
        (including when both inputs are empty).

    Raises:
        IndexError: If ``test_set`` has more entries than ``candidate_set``.
    """
    total_precision = 0.0

    for i, query in enumerate(test_set):
        for position, candidate in enumerate(candidate_set[i], start=1):
            if isinstance(candidate, str):
                # `query in candidate` would be a substring test here.
                hit = query == candidate
            else:
                hit = query in candidate
            if hit:
                # Only the first (best-ranked) hit counts for this query.
                total_precision += 1.0 / position
                break

    # A positive total implies at least one query, hence a non-empty
    # candidate_set, so the division below cannot be by zero.
    if total_precision > 0:
        return total_precision / len(candidate_set)
    return 0.0


# Score the recommended doctor lists against the expected doctors and print
# the result as a percentage.
average_precision = calculate_average_precision(
    candidate_set=doctors_lists,
    test_set=test_data,
)
print(f"平均精准度: {average_precision:.2%}")