Commit 97b21438 authored by lichusong's avatar lichusong

0716-END0516文件放SED模型(sentence bert elmo ...)

parent 176a2f4c
import ast
import math

import numpy as np
import pandas as pd
from elmoformanylangs import Embedder

import handler
def calculate_cosine_similarity(target_indices, output_path):
    """For each target sentence, return its five most similar sentences.

    Similarity is the cosine similarity between POS-weighted mean ELMo
    sentence vectors. Character-level ELMo vectors are precomputed and
    loaded from ``elmo_vectors.npy`` (one ragged array per sentence).

    Args:
        target_indices: iterable of row indices into the word-list sheet;
            each index is treated as one query sentence.
        output_path: currently unused; kept for interface compatibility.

    Returns:
        list[dict]: one dict per similar sentence, with keys
        '医生集', '医生名称', '相似度得分'.
    """
    # Tokenised word lists plus POS tags and per-word character spans.
    df = pd.read_excel(r'../../data-dev/医生分词结果词性表0511.xlsx',
                       sheet_name='Sheet1')
    # Precomputed per-character ELMo vectors (1024-d), one entry per sentence.
    elmo_vecs = np.load('elmo_vectors.npy', allow_pickle=True)

    # POS weights: verbs/nouns up-weighted; adjectives/adverbs slightly
    # down-weighted; conjunctions/unknown words down-weighted most.
    weight_dict = {'v': 1.2, 'n': 1.2, 'a': 0.8, 'd': 0.8, 'c': 0.6, 'u': 0.6}

    # Collapse character vectors into one vector per word (mean over the
    # word's character span), apply the POS weight, and stack per sentence.
    sent_matrices = []
    for i, char_vecs in enumerate(elmo_vecs):
        # literal_eval, not eval: the cells hold Python-literal lists, and
        # eval would execute arbitrary spreadsheet content.
        pos_list = ast.literal_eval(df.loc[i, 'pos_list'])
        pos_index = ast.literal_eval(df.loc[i, 'pos_index'])
        word_vecs = []
        for pos, (start, end) in zip(pos_list, pos_index):
            word_vec = sum(char_vecs[start:end]) / (end - start)
            # Unlisted POS tags keep weight 1.0 (unweighted).
            word_vec = word_vec * weight_dict.get(pos, 1.0)
            word_vecs.append(word_vec)
        # Stack once per sentence instead of np.vstack inside the loop
        # (the original rebuilt the array per word: O(n^2) copying).
        if word_vecs:
            sent_matrices.append(np.vstack(word_vecs))
        else:
            sent_matrices.append(np.zeros((0, 1024)))

    results = []
    for target_index in target_indices:
        # Hoist the target's mean vector and norm out of the inner loop.
        target_mean = sent_matrices[target_index].mean(axis=0)
        target_norm = np.linalg.norm(target_mean)
        sims = []
        for mat in sent_matrices:
            vec_mean = mat.mean(axis=0)
            sims.append(np.dot(vec_mean, target_mean)
                        / (np.linalg.norm(vec_mean) * target_norm))
        df['similarity'] = sims
        # Take six and skip rank 0: the target sentence is its own best
        # match, so entries 1..5 are the real top five.
        top_six = df.nlargest(6, 'similarity')
        top_idx = top_six['idx'].tolist()
        top_sim = top_six['similarity'].tolist()
        for idx, similarity in zip(top_idx[1:], top_sim[1:]):
            doctor_id = handler.find_doctor_id_by_idx(idx)
            names = handler.find_doctor_name_by_idx(doctor_id)
            results.append({
                '医生集': 'B',
                '医生名称': names[0],
                '相似度得分': similarity,
            })
    return results
\ No newline at end of file
import pandas as pd
import numpy as np
def find_top_similar_doctors(target_indices):
    """For each target doctor, find the five most similar doctors by
    cosine similarity over their numeric attribute columns.

    Args:
        target_indices: iterable of 1-based row numbers into the
            '医生属性+' sheet.

    Returns:
        list[dict]: one ``{'idx': i, 'similarities': [(row_no, score), ...]}``
        per target, similarities sorted descending, top five only.

    Side effects:
        Writes every retained (idx, 相似idx, 相似度) triple to
        '../../data-dev/医生属性相似度2.xlsx'.
    """
    def cosine_similarity(u, v):
        # NOTE(review): assumes non-zero norms; an all-zero attribute row
        # would divide by zero — confirm against the data.
        dot_product = np.dot(u, v)
        return dot_product / (np.linalg.norm(u) * np.linalg.norm(v))

    df = pd.read_excel('../../data-dev/消化内科对话_已解决问题0510.xlsx',
                       sheet_name='医生属性+')
    # Numeric attribute columns used as the doctor's feature vector.
    columns = ['医生职位', '所属医院等级', '科室', '健康顾问(元)', '图文咨询',
               '指定咨询(元)', '采纳率', '回答总数', '好评数', '满意度']
    data = df[columns].values

    result = []
    export_rows = []  # rows for the similarity spreadsheet
    for i in target_indices:
        row_i = data[i - 1]  # target_indices are 1-based
        similarities = [
            (j + 1, cosine_similarity(row_i, data[j]))
            for j in range(len(data))
            if j != i - 1  # skip self-comparison
        ]
        similarities.sort(key=lambda pair: pair[1], reverse=True)
        top_five = similarities[:5]
        result.append({'idx': i, 'similarities': top_five})
        # Bug fix: the original created similarity_df but never filled it,
        # so the exported spreadsheet was always empty.
        export_rows.extend(
            {'idx': i, '相似idx': j, '相似度': score} for j, score in top_five)

    similarity_df = pd.DataFrame(export_rows,
                                 columns=['idx', '相似idx', '相似度'])
    similarity_df.to_excel('../../data-dev/医生属性相似度2.xlsx', index=False)
    return result
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
class OutputQStruct:
    """One query-matching result: a dialogue row and its similarity score.

    Attributes:
        idx: row identifier of the matched dialogue.
        name: doctor name associated with the match.
        sentence: matched (stripped) title text.
        score: similarity score of the match.
    """

    def __init__(self, idx, name, sentence, score):
        self.idx = idx
        self.name = name
        self.sentence = sentence
        self.score = score

    def __repr__(self):
        # Added for debuggability; the original had no repr, so instances
        # printed as opaque object addresses.
        return (f"{type(self).__name__}(idx={self.idx!r}, name={self.name!r}, "
                f"sentence={self.sentence!r}, score={self.score!r})")
def process_data(query, top_k=50):
    """Rank dialogue titles by BERT [CLS] similarity to ``query``.

    Encodes every title from the '对话数据+' sheet and the query with
    bert-base-chinese, scores each title with the inner product of the
    [CLS] embeddings, min-max normalises the scores to [0, 1], and
    returns the best matches.

    Args:
        query: free-text query string.
        top_k: number of top matches to return (default 50, matching the
            original hard-coded cutoff; the old name 'top200_idx' was
            misleading).

    Returns:
        list[OutputQStruct]: best matches, highest score first.
    """
    output_values = []
    tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
    model = AutoModel.from_pretrained('bert-base-chinese')
    df = pd.read_excel('../../data-dev/消化内科对话_已解决问题0510.xlsx',
                       sheet_name='对话数据+')
    corpus = df['标题'].tolist()

    # Batch-encode all titles; the [CLS] token (position 0) of the last
    # hidden state serves as the sentence vector.
    encoded_dict = tokenizer(corpus, padding=True, truncation=True,
                             max_length=32, return_tensors='pt')
    with torch.no_grad():
        outputs = model(encoded_dict['input_ids'],
                        attention_mask=encoded_dict['attention_mask'])
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()

    query_encoded = tokenizer.encode_plus(
        query, add_special_tokens=True, max_length=32,
        padding='longest', truncation=True, return_tensors='pt')
    with torch.no_grad():
        query_output = model(query_encoded['input_ids'],
                             attention_mask=query_encoded['attention_mask'])
    query_embedding = query_output.last_hidden_state[:, 0, :].numpy()

    # Inner product of [CLS] vectors, min-max normalised to [0, 1].
    similarities = np.inner(query_embedding, embeddings)
    similarities = similarities.squeeze()  # drop the singleton query axis
    scaler = MinMaxScaler()
    similarities = scaler.fit_transform(similarities.reshape(-1, 1)).squeeze()

    top_idx = similarities.argsort()[::-1][:top_k]
    for idx in top_idx:
        output_struct = OutputQStruct(df.iloc[idx]['idx'],
                                      df.iloc[idx]['医生姓名'],
                                      corpus[idx].strip(),
                                      similarities[idx])
        output_values.append(output_struct)
    return output_values
import pandas as pd
def find_minimum_idx(matching_data):
    """Return the smallest idx for each distinct sentence, ascending.

    Entries in ``matching_data`` may repeat the same ``sentence``; only
    the smallest ``idx`` per distinct sentence is kept.

    Args:
        matching_data: iterable of objects exposing ``.sentence`` and ``.idx``.

    Returns:
        list: the minimum idx of each distinct sentence, sorted ascending.
    """
    minimum_by_sentence = {}
    for entry in matching_data:
        key = entry.sentence
        # First occurrence wins outright; later ones only if strictly smaller.
        if key not in minimum_by_sentence or entry.idx < minimum_by_sentence[key]:
            minimum_by_sentence[key] = entry.idx
    return sorted(minimum_by_sentence.values())
def find_doctor_id_by_idx(idx):
    """Map a dialogue-row ``idx`` to the corresponding doctor identifier.

    Looks up the doctor's name on the '对话数据+' sheet for the given idx,
    then resolves that name to a '医生标识' on the '医生属性+' sheet.

    Args:
        idx: value from the 'idx' column of the dialogue sheet.

    Returns:
        The first matching '医生标识' value.
    """
    excel_file_path = '../../data-dev/消化内科对话_已解决问题0510.xlsx'
    dialogue_df = pd.read_excel(excel_file_path, sheet_name='对话数据+')
    # First row whose 'idx' matches gives us the doctor's name.
    idx_column = dialogue_df['idx']
    match_row = idx_column[idx_column == idx].index[0]
    doctor_name = dialogue_df['医生姓名'][match_row]
    # Resolve the name to the doctor identifier on the attributes sheet.
    doctor_info_df = pd.read_excel(excel_file_path, sheet_name='医生属性+')
    matches = doctor_info_df.loc[
        doctor_info_df['医生姓名'] == doctor_name, '医生标识']
    return matches.values[0]
def find_doctor_name_by_idx(idx):
    """Return all doctor names whose '医生标识' equals ``idx``.

    Args:
        idx: doctor identifier to match on the '医生属性+' sheet.

    Returns:
        list: matching doctor names (possibly empty).
    """
    attributes_df = pd.read_excel(
        '../../data-dev/消化内科对话_已解决问题0510.xlsx',
        sheet_name='医生属性+')
    # Filter by identifier and project down to the name column in one step.
    name_series = attributes_df.loc[
        attributes_df['医生标识'] == idx, '医生姓名']
    return name_series.tolist()
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment