Дообученная (fine-tuned) модель BERT работает не так, как её обучали

Я дообучил модель BERT, чтобы изменить её поведение по умолчанию при оценке схожести текстов на итальянском языке. Я вручную задал оценки схожести для пар текстов, но даже после дообучения модель не работает должным образом.

Результат при использовании предварительно обученной модели:

      Query: latte al cioccolato  #chocolate milk

Top 3 most similar sentences in the corpus:
Milka  cioccolato al latte 100 g (Score: 0.7714)   #Milka milk chocolate 100 g
Alpro, Cioccolato bevanda a base di soia 1 ltr (Score: 0.5586)  #Alpro, Chocolate soy drink 1 ltr(soya milk)
Danone, HiPRO 25g Proteine gusto cioccolato 330 ml (Score: 0.4569) #Danone, HiPRO 25g Protein chocolate flavor 330 ml(protein chocolate milk)

В приведённом выше примере проблема в том, что предварительно обученная модель BERT не учитывает контекстную (смысловую) схожесть. Результаты должны идти в следующем порядке.

Ожидаемый результат:

      Query: latte al cioccolato  #chocolate milk

Top 3 most similar sentences in the corpus:
Alpro, Cioccolato bevanda a base di soia 1 ltr (Score: 0.99)  #Alpro, Chocolate soy drink 1 ltr(soya milk)
Danone, HiPRO 25g Proteine gusto cioccolato 330 ml (Score: 0.95) #Danone, HiPRO 25g Protein chocolate flavor 330 ml(protein chocolate milk)
Milka  cioccolato al latte 100 g (Score: 0.40)   #Milka milk chocolate 100 g

Поэтому, чтобы получить ожидаемый результат, я дообучил предварительно обученную модель, но она по-прежнему возвращает тот же результат. Пожалуйста, помогите мне с этим.

Дообучение (fine-tuning):

      import pandas as pd
# Fine-tune a pretrained multilingual sentence-embedding model on a handful of
# (query, product, target similarity) pairs and save the result to disk.

# Product descriptions used as the sample corpus.
data = {
    'input': [
        "Alpro, Cioccolato bevanda a base di soia 1 ltr",  # Alpro, Chocolate soy drink 1 ltr
        "Milka  cioccolato al latte 100 g",  # Milka milk chocolate 100 g
        "Danone, HiPRO 25g Proteine gusto cioccolato 330 ml",  # Danone, HiPRO 25g Protein chocolate flavor 330 ml
    ]
}

# Wrap the corpus in a pandas DataFrame and show it.
x_sample = pd.DataFrame(data)
print(x_sample['input'])

# load model
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation
from torch.utils.data import DataLoader

embedder = SentenceTransformer('distiluse-base-multilingual-cased')  # or any other pretrained model
print("embedder loaded...")

# Hand-labelled pairs: the same query against each product, with the cosine
# similarity the model should produce for that pair.
sentences1 = ['latte al cioccolato', 'latte al cioccolato', 'latte al cioccolato']
sentences2 = [
    'Alpro, Cioccolato bevanda a base di soia 1 ltr',
    'Danone, HiPRO 25g Proteine gusto cioccolato 330 ml',
    'Milka  cioccolato al latte 100 g',
]
scores = [0.99, 0.95, 0.4]

# The evaluator reuses the training pairs; it exists so that fit() can pick
# and save the best checkpoint (save_best_model=True).
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

# Training data, dataloader and loss.
examples = [
    InputExample(texts=[s1, s2], label=l)
    for s1, s2, l in zip(sentences1, sentences2, scores)
]
train_dataloader = DataLoader(examples, shuffle=False, batch_size=4, num_workers=1)
train_loss = losses.CosineSimilarityLoss(embedder)

# Warm-up has to be a small fraction of the whole run. The previous fixed
# value of 500 was far larger than the total number of optimizer steps
# (1 batch x 5 epochs = 5), so with the default warm-up schedule the learning
# rate never left its near-zero ramp and the "fine-tuned" model scored the
# same as the pretrained one.
num_epochs = 5
total_steps = len(train_dataloader) * num_epochs
warmup_steps = total_steps // 10  # ~10% of training; 0 for very short runs

# tune the model
# NOTE(review): three examples and five steps is a very small training signal;
# more labelled pairs and/or more epochs are probably needed to reach the
# target scores - confirm empirically.
embedder.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=1,
    output_path='fine_tuned_bert',
    save_best_model=True,
    show_progress_bar=True,
)

Использование дообученной выше модели:

      import scipy
import numpy as np
from sentence_transformers import models, SentenceTransformer
# Rank every corpus sentence by cosine similarity to each query using the
# fine-tuned model and print the best matches.

# Import cdist explicitly: a bare `import scipy` does not guarantee that the
# `scipy.spatial` submodule is loaded on every SciPy version.
from scipy.spatial.distance import cdist

# To compare against the pretrained baseline, load
# 'distiluse-base-multilingual-cased' instead (works with Arabic, Chinese,
# Dutch, English, French, German, Italian, Korean, Polish, Portuguese,
# Russian, Spanish, Turkish).
model = SentenceTransformer('/content/fine_tuned_bert')  # your fine-tuned model

corpus = [
    "Alpro, Cioccolato bevanda a base di soia 1 ltr",  # Alpro, Chocolate soy drink 1 ltr
    "Milka  cioccolato al latte 100 g",  # Milka milk chocolate 100 g
    "Danone, HiPRO 25g Proteine gusto cioccolato 330 ml",  # Danone, HiPRO 25g Protein chocolate flavor 330 ml
]
corpus_embeddings = model.encode(corpus)

queries = [
    'latte al cioccolato',  # milk with chocolate flavor
]
query_embeddings = model.encode(queries)

# Maximum number of matches to print per query; slicing below simply yields
# fewer rows when the corpus is smaller than this.
closest_n = 10

for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance of this query to every corpus sentence (smaller = closer).
    distances = cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # (corpus index, distance) pairs, nearest first.
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n======================\n")
    print("Query:", query)
    print("\nTop %d most similar sentences in corpus:" % closest_n)

    for idx, distance in ranked[:closest_n]:
        # Report similarity (1 - cosine distance) rather than distance.
        print(corpus[idx].strip(), "(Score: %.4f)" % (1 - distance))

Результат дообученной модели:

      ======================

Query: latte al cioccolato

Top 10 most similar sentences in corpus:
Milka  cioccolato al latte 100 g (Score: 0.7714)
Alpro, Cioccolato bevanda a base di soia 1 ltr (Score: 0.5586)
Danone, HiPRO 25g Proteine gusto cioccolato 330 ml (Score: 0.4569)

0 ответов

Другие вопросы по тегам