"""Explore BERT hidden states: token vectors, sentence vectors, cosine similarity.

The script loads a BERT checkpoint with ``output_hidden_states=True``, inspects
the structure of the returned hidden states, builds token-level vectors from
the last four layers (concatenation and sum), builds sentence vectors by mean
pooling the second-to-last layer, and compares vectors with cosine similarity.
"""
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel


def _masked_mean_pool(token_vectors: tf.Tensor, attention_mask: tf.Tensor) -> tf.Tensor:
    """Average token vectors over the token axis, ignoring padded positions.

    Args:
        token_vectors: tensor indexed as (sentence, token, feature).
        attention_mask: tensor indexed as (sentence, token); non-zero entries
            mark real tokens, zeros mark padding.

    Returns:
        Tensor indexed as (sentence, feature).
    """
    mask = tf.cast(attention_mask, token_vectors.dtype)[:, :, tf.newaxis]
    summed = tf.reduce_sum(token_vectors * mask, axis=1)
    # Clamp the count so a row made only of padding can never divide by zero.
    counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
    return summed / counts


def _cosine_similarity(reference: tf.Tensor, candidates: tf.Tensor) -> tf.Tensor:
    """Return the cosine similarity of ``reference`` to each row of ``candidates``.

    ``tf.keras.losses.cosine_similarity`` is a loss: it returns the *negated*
    similarity so that minimising it maximises similarity. The sign is flipped
    here so that larger values mean "more similar".
    """
    return -tf.keras.losses.cosine_similarity(reference, candidates)


# Load the model.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_hidden_states=True makes the model return the hidden states of every layer.
model = TFAutoModel.from_pretrained(model_name, output_hidden_states=True)

# Test sentences.
# NOTE(review): these sentences are Chinese but the checkpoint is the English
# "bert-base-uncased"; characters missing from its vocabulary are presumably
# mapped to [UNK], which would make the vectors below less meaningful. Consider
# a Chinese or multilingual checkpoint for this part -- TODO confirm.
utt = ['今天的月亮又大又圆', '月亮真的好漂亮啊', '今天去看电影吧', "爱情睡醒了,天琪抱着小贝进酒店", "侠客行风万里"]
inputs = tokenizer(utt, return_tensors="tf", padding="max_length", truncation=True, max_length=64)
outputs = model(inputs)
# Use the named field instead of a positional index, which depends on which
# optional outputs happen to be populated.
hidden_states = outputs.hidden_states  # one tensor per layer

# Structure of hidden_states:
#   1. layer index    (13 entries)
#   2. batch index    (5 sentences, i.e. the number of input sentences)
#   3. token index    (64 tokens, i.e. max_length)
#   4. hidden unit    (768 features)
# Why 13 entries when BERT has 12 layers? The first entry is the output of the
# input embedding layer; the remaining 12 are the BERT encoder layers.
print("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
layer_i = 0
print("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0
print("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0
print("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

# Look at one token of the first sentence: token index 5 at hidden-state
# index 5 (index 0 is the embedding output, so this is encoder layer 5).
token_i = 5
layer_i = 5
vec = hidden_states[layer_i][batch_i][token_i]
print(vec)

# Plot the values as a histogram to show their distribution.
plt.figure(figsize=(10, 10))
# Hand matplotlib a NumPy array explicitly rather than an eager tensor.
plt.hist(vec.numpy(), bins=200)
plt.show()

# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
sentence_embeddings = tf.stack(hidden_states, axis=0)  # the layer axis (13) becomes axis 0
print(f"sentence_embeddings.shape : {sentence_embeddings.shape}")

# Reorder the axes to (sentence, token, layer, feature) so that every token
# carries its representation from all 13 layers.
sentence_embeddings_perm = tf.transpose(sentence_embeddings, perm=[1, 2, 0, 3])
print(f"sentence_embeddings_perm.shape : {sentence_embeddings_perm.shape}")

# Token-level dense vectors.
# The original per-token Python loops reset their result list for every
# sentence (so only the last sentence survived) and printed once per token.
# The batched ops below keep the vectors of all sentences.

## Option 1: concatenate the last four layers (last layer first, as before).
token_vecs_cat = tf.concat(
    [sentence_embeddings_perm[:, :, layer] for layer in (-1, -2, -3, -4)],
    axis=-1,
)
print(f"token_vecs_cat.shape : {token_vecs_cat.shape}")

## Option 2: sum the last four layers.
token_vecs_sum = tf.reduce_sum(sentence_embeddings_perm[:, :, -4:], axis=2)
print(f"token_vecs_sum.shape : {token_vecs_sum.shape}")

# Sentence-level dense vectors.
## Average the second-to-last layer over the tokens of each sentence.
token_vecs = sentence_embeddings[-2]
print(f"token_vecs.shape : {token_vecs.shape}")
# Inputs are padded to max_length, so a plain mean over the token axis would be
# dominated by [PAD] positions; weight by the attention mask instead.
sentences_embedding = _masked_mean_pool(token_vecs, inputs["attention_mask"])
print(f"sentences_embedding.shape : {sentences_embedding.shape}")

# Cosine similarity.
## Similarity between the first sentence and every sentence (itself included).
tensor_test = sentences_embedding[0]
consine_sim_tensor = _cosine_similarity(tensor_test, sentences_embedding)
print(f"consine_sim_tensor : {consine_sim_tensor}")

## How similar are the vectors of the same word "bank" in different contexts?
utt = ["After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."]
inputs = tokenizer(utt, return_tensors="tf", padding="max_length",
                   truncation=True, max_length=22)
# Token positions for the sentence above:
#    0 [CLS]
#    1 after
#    2 stealing
#    3 money
#    4 from
#    5 the
#    6 bank
#    7 vault
#    8 ,
#    9 the
#   10 bank
#   11 robber
#   12 was
#   13 seen
#   14 fishing
#   15 on
#   16 the
#   17 mississippi
#   18 river
#   19 bank
#   20 .
#   21 [SEP]
# The word "bank" sits at positions 6, 10 and 19.
outputs = model(inputs)
hidden_states = outputs.hidden_states  # one tensor per layer

# Sum the last four layers to get one vector per token.
tokens_embedding = tf.reduce_sum(tf.stack(hidden_states[-4:], axis=0), axis=0)
bank_vault = tokens_embedding[0][6]
bank_robber = tokens_embedding[0][10]
river_bank = tokens_embedding[0][19]

consine_sim_tensor = _cosine_similarity(bank_vault, tf.stack([bank_robber, river_bank], axis=0))
print(f"consine_sim_tensor : {consine_sim_tensor}")
# An earlier run that printed the raw (negated) loss values reported
# [-0.93863535 -0.69570863], i.e. similarities of roughly 0.94 and 0.70.
# "bank" in "bank vault" is therefore closer to "bank" in "bank robber" than
# to "bank" in "river bank" -- which makes sense.