In [1]:
# Uncomment the line below to install exlib into the kernel's environment
# %pip install exlib
In [ ]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import numpy as np
import pandas as pd
import tqdm
from tqdm import tqdm
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
import sentence_transformers

import sys
sys.path.append('../../src')
import exlib
from exlib.datasets.multilingual_politeness import PolitenessDataset, PolitenessClassifier, PolitenessFixScore, get_politeness_scores
from exlib.datasets.politeness_helper import load_lexica

from exlib.features.text import *

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Load datasets and pre-trained model¶

Sample inference on dataset¶

In [3]:
# Seed PyTorch's RNG so anything random in this cell is repeatable across runs.
# NOTE(review): the recorded output of this cell warns that the classifier-head
# weights were "newly initialized" instead of loaded from the checkpoint. If that
# still happens, the scores printed below come from an untrained head -- confirm
# that the intended checkpoint is being loaded.
torch.manual_seed(1234)

# Test split, served in dataset order (no shuffling), four examples per batch.
dataset = PolitenessDataset("test")
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)

model = PolitenessClassifier()
model.to(device)
model.eval()

# This is inference only: run the forward pass under no_grad so autograd does
# not record a graph for it.
with torch.no_grad():
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = model(input_ids, attention_mask)
        # Turn the token ids back into text for display, dropping special tokens.
        utterances = [dataset.tokenizer.decode(input_id, skip_special_tokens=True) for input_id in input_ids]
        # Each element of `output` is read as a single scalar via .item().
        for utterance, label in zip(utterances, output):
            print("Text: {}\nPoliteness: {}\n".format(utterance, label.item()))
        # Only the first batch is shown as a sample.
        break
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at anonymized-model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|                                                                                          | 0/143 [00:00<?, ?it/s]
Text: The intro mentions the ISO 8601 international standard adopted in most western countries. What does this even mean? Who are we suggesting has done the adoption?
Politeness: 0.21642041206359863

Text: I'm a user on PrettyCure.org, and somebody on the site said they are making a fourth season of PreCure. It's a rumuor, but is it true? That person said it's more like Tokyo Mew Mew, a group of girls.
Politeness: 0.21494880318641663

Text: Hello fellow Wikipedians, I have just added archive links to on Essen. Please take a moment to review my edit. If necessary, add after the link to keep me from modifying it.
Politeness: 0.12163643538951874

Text: I saw the template citing this issue and since there was no section here discussing it I've decided to start one. I'm a Canadian and most of our television programs are also aired in the US so my knowledge of what's on TV outside of North America is limited. So I'm not sure of how much help I can be, but I do have some ideas on how to improve this section and I'm open to feedback.
Politeness: 0.09295740723609924


Baselines¶

In [4]:
# Compute the scores for the word, phrase, and sentence baselines in one call.
# The result is indexed by baseline name in the reporting cell that follows.
all_baselines_scores = get_politeness_scores(
    baselines=['word', 'phrase', 'sentence'],
)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at anonymized-model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
---- word Level Groups ----
100%|████████████████████████████████████████████████████████████████████████████████| 143/143 [01:22<00:00,  1.73it/s]
---- phrase Level Groups ----
100%|████████████████████████████████████████████████████████████████████████████████| 143/143 [02:05<00:00,  1.14it/s]
---- sentence Level Groups ----
100%|████████████████████████████████████████████████████████████████████████████████| 143/143 [01:53<00:00,  1.27it/s]
In [5]:
# Print the mean score of each baseline, ignoring NaN entries.
for name, baseline_scores in all_baselines_scores.items():
    mean_metric = torch.tensor(baseline_scores).nanmean()
    print(f'BASELINE {name} mean score: {mean_metric}')
BASELINE word mean score: 0.6839182740178517
BASELINE phrase mean score: 0.6350535143089757
BASELINE sentence mean score: 0.6108726882043238

BERTopic (Clustering)¶

Create topics from the dataset.

In [6]:
# First, gather every test-split utterance as one space-joined string.

dataset = PolitenessDataset("test")
utterances = [
    ' '.join(dataset[idx]['word_list'])
    for idx in range(len(dataset))
]
# Saving the list to disk is left disabled here; uncomment to persist it.
# torch.save(utterances, '../../fix/utterances/multilingual_politeness_test.pt')
In [ ]:
# then, install bertopic + use them on the utterances

!pip install bertopic
In [8]:
# Make sure utterances_path is set to where the utterances are saved in your directory.
# NOTE(review): utterances_path is not set in the visible cells -- presumably it
# is configured inside exlib; confirm before running.
# This rebinds all_baselines_scores, replacing the word/phrase/sentence results.
all_baselines_scores = get_politeness_scores(
    baselines=['clustering'],
)
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at anonymized-model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
---- clustering Level Groups ----
100%|████████████████████████████████████████████████████████████████████████████████| 143/143 [00:17<00:00,  8.06it/s]
In [9]:
# Print the mean score of each baseline, ignoring NaN entries.
# This uses the same torch.tensor(...).nanmean() reduction as the
# word/phrase/sentence report, so one NaN score no longer turns the whole
# mean into NaN and the two reports stay comparable.
for name, score in all_baselines_scores.items():
    mean_metric = torch.tensor(score).nanmean()
    print(f'BASELINE {name} mean score: {mean_metric}')
BASELINE clustering mean score: 0.6679689280296627
In [ ]: