## Experiment exp027-2
xlm-roberta-large, Batch Size: 32, Learning Rate: 2e-5, Warmup Steps: 500

In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    BertForTokenClassification,
    AutoModelForTokenClassification
)
import torch
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '1'

In [3]:
class SpanClassifierWithStrictF1:
    def __init__(self, model_name="deepset/gbert-base"):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.labels =[
            "O",
            "B-positive feedback", "B-compliment", "B-affection declaration", "B-encouragement", "B-gratitude", "B-agreement", "B-ambiguous", "B-implicit", "B-group membership", "B-sympathy",
            "I-positive feedback", "I-compliment", "I-affection declaration", "I-encouragement", "I-gratitude", "I-agreement", "I-ambiguous", "I-implicit", "I-group membership", "I-sympathy"
        ]
        self.label2id = {label: i for i, label in enumerate(self.labels)}
        self.id2label = {i: label for i, label in enumerate(self.labels)}

    def create_dataset(self, comments_df, spans_df):
        """Erstelle Dataset mit BIO-Labels und speichere Evaluation-Daten"""
        examples = []
        eval_data = []  # Für Strict F1 Berechnung

        spans_grouped = spans_df.groupby(['document', 'comment_id'])

        for _, row in comments_df.iterrows():
            text = row['comment']
            document = row['document']
            comment_id = row['comment_id']
            key = (document, comment_id)

            # True spans für diesen Kommentar
            if key in spans_grouped.groups:
                true_spans = [(span_type, int(start), int(end))
                              for span_type, start, end in
                              spans_grouped.get_group(key)[['type', 'start', 'end']].values]
            else:
                true_spans = []

            # Tokenisierung
            tokenized = self.tokenizer(text, truncation=True, max_length=512,
                                       return_offsets_mapping=True)

            # BIO-Labels erstellen
            labels = self._create_bio_labels(tokenized['offset_mapping'],
                                             spans_grouped.get_group(key)[['start', 'end', 'type']].values
                                             if key in spans_grouped.groups else [])

            examples.append({
                'input_ids': tokenized['input_ids'],
                'attention_mask': tokenized['attention_mask'],
                'labels': labels
            })

            # Evaluation-Daten speichern
            eval_data.append({
                'text': text,
                'offset_mapping': tokenized['offset_mapping'],
                'true_spans': true_spans,
                'document': document,
                'comment_id': comment_id
            })

        return examples, eval_data

    def _create_bio_labels(self, offset_mapping, spans):
        """Erstelle BIO-Labels für Tokens"""
        labels = [0] * len(offset_mapping)  # 0 = "O"

        for start, end, type_label in spans:
            for i, (token_start, token_end) in enumerate(offset_mapping):
                if token_start is None:  # Spezielle Tokens
                    continue

                # Token überlappt mit Span
                if token_start < end and token_end > start:
                    if token_start <= start:
                        labels[i] = self.label2id[f'B-{type_label}'] # B-compliment
                    else:
                        labels[i] = self.label2id[f'I-{type_label}'] # I-compliment

        return labels

    def compute_metrics(self, eval_pred):
        """Berechne Strict F1 für Trainer"""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=2)

        # Konvertiere Vorhersagen zu Spans
        batch_pred_spans = []
        batch_true_spans = []

        for i, (pred_seq, label_seq) in enumerate(zip(predictions, labels)):
            # Evaluation-Daten für dieses Beispiel
            if i < len(self.current_eval_data):
                eval_item = self.current_eval_data[i]
                text = eval_item['text']
                offset_mapping = eval_item['offset_mapping']
                true_spans = eval_item['true_spans']

                # Filtere gültige Vorhersagen (keine Padding-Tokens)
                valid_predictions = []
                valid_offsets = []

                for j, (pred_label, true_label) in enumerate(zip(pred_seq, label_seq)):
                    if true_label != -100 and j < len(offset_mapping):
                        valid_predictions.append(pred_label)
                        valid_offsets.append(offset_mapping[j])

                # Konvertiere zu Spans
                pred_spans = self._predictions_to_spans(valid_predictions, valid_offsets, text)
                pred_spans_tuples = [(span['type'], span['start'], span['end']) for span in pred_spans]

                batch_pred_spans.append(pred_spans_tuples)
                batch_true_spans.append(true_spans)

        # Berechne Strict F1
        strict_f1, strict_precision, strict_recall, tp, fp, fn = self._calculate_strict_f1(
            batch_true_spans, batch_pred_spans
        )

        torch.cuda.memory.empty_cache()

        return {
            "strict_f1": torch.tensor(strict_f1),
            "strict_precision": torch.tensor(strict_precision),
            "strict_recall": torch.tensor(strict_recall),
            "true_positives": torch.tensor(tp),
            "false_positives": torch.tensor(fp),
            "false_negatives": torch.tensor(fn)
        }

    def _calculate_strict_f1(self, true_spans_list, pred_spans_list):
        """Berechne Strict F1 über alle Kommentare"""
        tp, fp, fn = 0, 0, 0

        for true_spans, pred_spans in zip(true_spans_list, pred_spans_list):
            # Finde exakte Matches (Typ und Span müssen übereinstimmen)
            matches = self._find_exact_matches(true_spans, pred_spans)

            tp += len(matches)
            fp += len(pred_spans) - len(matches)
            fn += len(true_spans) - len(matches)

        # Berechne Metriken
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        return f1, precision, recall, tp, fp, fn

    def _find_exact_matches(self, true_spans, pred_spans):
        """Finde exakte Matches zwischen True und Predicted Spans"""
        matches = []
        used_pred = set()

        for true_span in true_spans:
            for i, pred_span in enumerate(pred_spans):
                if i not in used_pred and true_span == pred_span:
                    matches.append((true_span, pred_span))
                    used_pred.add(i)
                    break

        return matches

    def _predictions_to_spans(self, predicted_labels, offset_mapping, text):
        """Konvertiere Token-Vorhersagen zu Spans"""
        spans = []
        current_span = None

        for i, label_id in enumerate(predicted_labels):
            if i >= len(offset_mapping):
                break

            label = self.id2label[label_id]
            token_start, token_end = offset_mapping[i]

            if token_start is None:
                continue

            if label.startswith('B-'):
                if current_span:
                    spans.append(current_span)
                current_span = {
                    'type': label[2:],
                    'start': token_start,
                    'end': token_end,
                    'text': text[token_start:token_end]
                }
            elif label.startswith('I-') and current_span:
                current_span['end'] = token_end
                current_span['text'] = text[current_span['start']:current_span['end']]
            else:
                if current_span:
                    spans.append(current_span)
                    current_span = None

        if current_span:
            spans.append(current_span)

        return spans

    def predict(self, texts):
        """Vorhersage für neue Texte"""
        if not hasattr(self, 'model'):
            raise ValueError("Modell muss erst trainiert werden!")

        predictions = []
        device = next(self.model.parameters()).device

        for text in texts:
            # Tokenisierung
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                    max_length=512, return_offsets_mapping=True)

            offset_mapping = inputs.pop('offset_mapping')
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Vorhersage
            with torch.no_grad():
                outputs = self.model(**inputs)

            predicted_labels = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()

            # Spans extrahieren
            spans = self._predictions_to_spans(predicted_labels, offset_mapping[0], text)
            predictions.append({'text': text, 'spans': spans})

        return predictions

    def evaluate_strict_f1(self, comments_df, spans_df):
        """Evaluiere Strict F1 auf Test-Daten"""
        if not hasattr(self, 'model'):
            raise ValueError("Modell muss erst trainiert werden!")

        print("Evaluiere Strict F1...")

        # Vorhersagen für alle Kommentare
        texts = comments_df['comment'].tolist()
        predictions = self.predict(texts)

        # Organisiere True Spans
        spans_grouped = spans_df.groupby(['document', 'comment_id'])
        true_spans_dict = {}
        pred_spans_dict = {}

        for i, (_, row) in enumerate(comments_df.iterrows()):
            key = (row['document'], row['comment_id'])

            # True spans
            if key in spans_grouped.groups:
                true_spans = [(span_type, int(start), int(end))
                              for span_type, start, end in
                              spans_grouped.get_group(key)[['type', 'start', 'end']].values]
            else:
                true_spans = []

            # Predicted spans
            pred_spans = [(span['type'], span['start'], span['end'])
                          for span in predictions[i]['spans']]

            true_spans_dict[key] = true_spans
            pred_spans_dict[key] = pred_spans

        # Berechne Strict F1
        all_true_spans = list(true_spans_dict.values())
        all_pred_spans = list(pred_spans_dict.values())

        f1, precision, recall, tp, fp, fn = self._calculate_strict_f1(all_true_spans, all_pred_spans)

        print(f"\nStrict F1 Ergebnisse:")
        print(f"Precision: {precision:.4f}")
        print(f"Recall:    {recall:.4f}")
        print(f"F1-Score:  {f1:.4f}")
        print(f"True Positives: {tp}, False Positives: {fp}, False Negatives: {fn}")

        return {
            'strict_f1': f1,
            'strict_precision': precision,
            'strict_recall': recall,
            'true_positives': tp,
            'false_positives': fp,
            'false_negatives': fn
        }

def convert_spans(row):
    spans = row['predicted_spans']
    document = row['document']
    comment_id = row['comment_id']
    return [{'document': document, 'comment_id': comment_id, 'type': span['type'], 'start': span['start'], 'end': span['end']} for span in spans]

def pred_to_spans(row):
    predicted_labels, offset_mapping, text = row['predicted_labels'], row['offset_mapping'], row['comment']
    return [classifier._predictions_to_spans(predicted_labels, offset_mapping, text)]

In [4]:
comments: pd.DataFrame = pd.read_csv("./share-GermEval2025-data/Data/training data/comments.csv")
task1: pd.DataFrame = pd.read_csv("./share-GermEval2025-data/Data/training data/task1.csv")
task2: pd.DataFrame = pd.read_csv("./share-GermEval2025-data/Data/training data/task2.csv")
comments = comments.merge(task1, on=["document", "comment_id"])
spans_grouped = task2.groupby(['document', 'comment_id'])

test_data: pd.DataFrame = pd.read_csv("./share-GermEval2025-data/Data/test data/comments.csv")

# check every comment that contain spans if they have overlapping start and end positions
task2['overlap'] = False
overlapping_spans = task2.groupby(['document', 'comment_id'])
for (doc, comment), group in overlapping_spans:
    if len(group) > 1:
        starts = group['start'].tolist()
        ends = group['end'].tolist()
        for i in range(len(starts)):
            for j in range(i + 1, len(starts)):
                if not (ends[i] <= starts[j] or ends[j] <= starts[i]):
                    task2.loc[(task2['document'] == doc) & (task2['comment_id'] == comment), 'overlap'] = True

task2 = task2[task2['overlap'] == False].drop(columns=['overlap'])

In [6]:
from multiset import *
ALL_LABELS = ["affection declaration","agreement","ambiguous",
              "compliment","encouragement","gratitude","group membership",
              "implicit","positive feedback","sympathy"]

def fine_grained_flausch_by_label(gold, predicted):
    gold['cid']= gold['document']+"_"+gold['comment_id'].apply(str)
    predicted['cid']= predicted['document']+"_"+predicted['comment_id'].apply(str)

    # annotation sets (predicted)
    pred_spans = Multiset()
    pred_spans_loose = Multiset()
    pred_types = Multiset()

    # annotation sets (gold)
    gold_spans = Multiset()
    gold_spans_loose = Multiset()
    gold_types = Multiset()

    for row in predicted.itertuples(index=False):
        pred_spans.add((row.cid,row.type,row.start,row.end))
        pred_spans_loose.add((row.cid,row.start,row.end))
        pred_types.add((row.cid,row.type))
    for row in gold.itertuples(index=False):
        gold_spans.add((row.cid,row.type,row.start,row.end))
        gold_spans_loose.add((row.cid,row.start,row.end))
        gold_types.add((row.cid,row.type))

    # precision = true_pos / true_pos + false_pos
    # recall = true_pos / true_pos + false_neg
    # f_1 = 2 * prec * rec / (prec + rec)

    results = {'TOTAL': {'STRICT': {},'SPANS': {},'TYPES': {}}}
    # label-wise evaluation (only for strict and type)
    for label in ALL_LABELS:
        results[label] = {'STRICT': {},'TYPES': {}}
        gold_spans_x = set(filter(lambda x: x[1].__eq__(label), gold_spans))
        pred_spans_x = set(filter(lambda x: x[1].__eq__(label), pred_spans))
        gold_types_x = set(filter(lambda x: x[1].__eq__(label), gold_types))
        pred_types_x = set(filter(lambda x: x[1].__eq__(label), pred_types))

        # strict: spans + type must match
        ### NOTE: x and y / x returns 0 if x = 0 and y/x otherwise (test for zero division)
        strict_p = float(len(pred_spans_x)) and float( len(gold_spans_x.intersection(pred_spans_x))) / len(pred_spans_x)
        strict_r = float(len(gold_spans_x)) and float( len(gold_spans_x.intersection(pred_spans_x))) / len(gold_spans_x)
        strict_f = (strict_p + strict_r) and 2 * strict_p * strict_r / (strict_p + strict_r)
        results[label]['STRICT']['prec'] = strict_p
        results[label]['STRICT']['rec'] = strict_r
        results[label]['STRICT']['f1'] = strict_f

        # detection mode: only types must match (per post)
        types_p = float(len(pred_types_x)) and float( len(gold_types_x.intersection(pred_types_x))) / len(pred_types_x)
        types_r = float(len(gold_types_x)) and float( len(gold_types_x.intersection(pred_types_x))) / len(gold_types_x)
        types_f = (types_p + types_r) and 2 * types_p * types_r / (types_p + types_r)
        results[label]['TYPES']['prec'] = types_p
        results[label]['TYPES']['rec'] = types_r
        results[label]['TYPES']['f1'] = types_f

    # Overall evaluation
    # strict: spans + type must match
    strict_p = float(len(pred_spans)) and float( len(gold_spans.intersection(pred_spans))) / len(pred_spans)
    strict_r = float(len(gold_spans)) and float( len(gold_spans.intersection(pred_spans))) / len(gold_spans)
    strict_f = (strict_p + strict_r) and 2 * strict_p * strict_r / (strict_p + strict_r)
    results['TOTAL']['STRICT']['prec'] = strict_p
    results['TOTAL']['STRICT']['rec'] = strict_r
    results['TOTAL']['STRICT']['f1'] = strict_f

    # spans: spans must match
    spans_p = float(len(pred_spans_loose)) and float( len(gold_spans_loose.intersection(pred_spans_loose))) / len(pred_spans_loose)
    spans_r = float(len(gold_spans_loose)) and float( len(gold_spans_loose.intersection(pred_spans_loose))) / len(gold_spans_loose)
    spans_f = (spans_p + spans_r) and 2 * spans_p * spans_r / (spans_p + spans_r)
    results['TOTAL']['SPANS']['prec'] = spans_p
    results['TOTAL']['SPANS']['rec'] = spans_r
    results['TOTAL']['SPANS']['f1'] = spans_f

    # detection mode: only types must match (per post)
    types_p = float(len(pred_types)) and float( len(gold_types.intersection(pred_types))) / len(pred_types)
    types_r = float(len(gold_types)) and float( len(gold_types.intersection(pred_types))) / len(gold_types)
    types_f = (types_p + types_r) and 2 * types_p * types_r / (types_p + types_r)
    results['TOTAL']['TYPES']['prec'] = types_p
    results['TOTAL']['TYPES']['rec'] = types_r
    results['TOTAL']['TYPES']['f1'] = types_f

#    print("STRICT:\n ",strict_p,strict_r,strict_f)
#    print("SPANS:\n ",spans_p,spans_r,spans_f)
#    print("TYPES:\n ",types_p,types_r,types_f)
    return(results)

In [7]:
classifier = SpanClassifierWithStrictF1('xlm-roberta-large')

In [8]:
# Dataset neu erstellen für diesen Fold
examples, eval_data = classifier.create_dataset(comments, task2)
train_examples, val_examples = train_test_split(examples, test_size=0.1, random_state=42)

# Evaluation-Daten entsprechend aufteilen
train_indices, val_indices = train_test_split(range(len(examples)), test_size=0.1, random_state=42)

In [13]:
classifier.model = AutoModelForTokenClassification.from_pretrained(
    'xlm-roberta-large',
    num_labels=len(classifier.labels),
    id2label=classifier.id2label,
    label2id=classifier.label2id
)
classifier.model.load_state_dict(torch.load('./experiments/exp027/exp027-2_retraining_final_model.pth'))
classifier.model.eval()

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_f

In [15]:
len(comments.iloc[val_indices].comment.tolist())

3706

In [16]:
comments_val = comments.iloc[val_indices].copy()
comments_val.reset_index(drop=True, inplace=True)

In [17]:
test_comments = comments_val.copy()

comments_val['gold_spans'] = None
comments_val['predicted_labels'] = None
comments_val['predicted_probs'] = None
comments_val['offset_mapping'] = None
comments_val['text_tokens'] = None

for idx in range(len(comments_val)): #range(15):
    row = comments_val.iloc[idx]
    text = row['comment']
    key = (row['document'], row['comment_id'])

    text_tokens = classifier.tokenizer.tokenize(text)
    comments_val.at[idx, 'text_tokens'] = text_tokens

    device = next(classifier.model.parameters()).device
    inputs = classifier.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, return_offsets_mapping=True)

    offset_mapping = inputs.pop('offset_mapping')
    comments_val.at[idx, 'offset_mapping'] = offset_mapping.cpu().numpy()[0].tolist()
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Vorhersage
    with torch.no_grad():
        outputs = classifier.model(**inputs)

    predicted_labels = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    predicted_probs = torch.nn.functional.softmax(outputs.logits, dim=2)[0].cpu().numpy()
    comments_val.at[idx, 'predicted_labels'] = predicted_labels
    comments_val.at[idx, 'predicted_probs'] = predicted_probs

    if key not in spans_grouped.groups:
        comments_val.at[idx, 'gold_spans'] = []
        pass
    else:
        spans = spans_grouped.get_group(key).to_dict(orient='records')
        comments_val.at[idx, 'gold_spans'] = spans

# or simply predict like this witout probabilities:
# val_set_predictions = classifier.predict(comments_val.comment.tolist())

Token indices sequence length is longer than the specified maximum sequence length for this model (1035 > 512). Running this sequence through the model will result in indexing errors


In [36]:
comments_val['predicted_spans'] = comments_val.apply(pred_to_spans, axis=1, result_type='expand')

test_gold_spans = pd.DataFrame((comments_val['gold_spans'].explode().dropna().tolist()))
test_baseline_spans = pd.DataFrame(comments_val.apply(convert_spans, axis=1).explode().dropna().tolist())
print(f"F1 on ES data before postprocessing {fine_grained_flausch_by_label(test_gold_spans, test_baseline_spans)['TOTAL']['STRICT']}")

F1 on ES data before postprocessing {'prec': 0.7364043506078055, 'rec': 0.7587343441001978, 'f1': 0.7474025974025974}


In [19]:
def build_spans_from_classification(tokens, classification, offset_mapping):
    """Modified version to work with string tokens and offset mapping"""
    res = []
    searching_end = False
    temp_res = []
    trunc_count = 0
    skip_count = 0

    for i, el in enumerate(classification):
        # Skip special tokens like CLS, SEP
        if i >= len(offset_mapping) or offset_mapping[i][0] is None:
            continue

        token_start, token_end = offset_mapping[i]

        if el == 'O' and searching_end is True:
            if i != 0 and i != len(tokens) - 1 and tokens[i].startswith('##'):
                continue
            if i > 0:
                prev_end = offset_mapping[i-1][1]
                temp_res[1] = prev_end
            else:
                temp_res[1] = -1
            res.append(temp_res)
            searching_end = False

        elif el.startswith('B-'):
            if i != 0 and i != len(tokens) - 1 and tokens[i].startswith('##'):
                continue
            if searching_end is True:
                if i > 0:
                    prev_end = offset_mapping[i-1][1]
                    temp_res[1] = prev_end
                else:
                    temp_res[1] = -1
                res.append(temp_res)
                trunc_count += 1
            split = el.split('-', 1)
            label_type = split[1]
            temp_res = [token_start, -1, label_type, ""]  # Changed structure
            searching_end = True

        elif el.startswith('I-'):
            if searching_end is True and i != 0 and i != len(tokens) - 1 and tokens[i].startswith('##'):
                continue
            split = el.split('-', 1)
            label_type = split[1]
            if searching_end is True and label_type != temp_res[2]:
                if i > 0:
                    prev_end = offset_mapping[i-1][1]
                    temp_res[1] = prev_end
                else:
                    temp_res[1] = -1
                res.append(temp_res)
                searching_end = False
                trunc_count += 1
            elif searching_end is False:
                skip_count += 1

    if searching_end is True and len(offset_mapping) > 1:
        temp_res[1] = offset_mapping[-1][1]
        res.append(temp_res)

    return res, skip_count, trunc_count

def apply_span_classification(row):
    """Apply classification to the tokens and return spans."""
    tokens = ['[CLS]', *row['text_tokens'], '[SEP]']
    classification = row['predicted_labels']
    offset_mapping = row['offset_mapping']

    # Convert classification to BIO format
    bio_labels = [classifier.id2label[label] for label in classification]

    print(len(tokens), len(bio_labels), len(offset_mapping))
    spans, skip_count, trunc_count = build_spans_from_classification(tokens, bio_labels, offset_mapping)

    document = row['document']
    comment_id = row['comment_id']

    return [{'document': document, 'comment_id': comment_id, 'type': span[2], 'start': span[0], 'end': span[1]}
            for span in spans if span[0] != -1 and span[1] != -1]

ge2017_rules_test_pred_spans = pd.DataFrame(comments_val.apply(apply_span_classification, axis=1).explode().dropna().tolist())

8 8 8
19 19 19
13 13 13
53 53 53
33 33 33
28 28 28
9 9 9
63 63 63
18 18 18
64 64 64
92 92 92
3 3 3
4 4 4
14 14 14
15 15 15
12 12 12
8 8 8
14 14 14
59 59 59
3 3 3
7 7 7
25 25 25
8 8 8
9 9 9
28 28 28
8 8 8
14 14 14
28 28 28
4 4 4
44 44 44
18 18 18
7 7 7
11 11 11
16 16 16
19 19 19
4 4 4
54 54 54
4 4 4
17 17 17
8 8 8
14 14 14
6 6 6
7 7 7
19 19 19
15 15 15
46 46 46
20 20 20
29 29 29
22 22 22
18 18 18
18 18 18
13 13 13
35 35 35
9 9 9
14 14 14
86 86 86
7 7 7
6 6 6
25 25 25
7 7 7
48 48 48
6 6 6
69 69 69
9 9 9
4 4 4
7 7 7
33 33 33
18 18 18
48 48 48
36 36 36
8 8 8
5 5 5
25 25 25
4 4 4
14 14 14
6 6 6
10 10 10
16 16 16
32 32 32
30 30 30
10 10 10
24 24 24
18 18 18
12 12 12
6 6 6
14 14 14
140 140 140
8 8 8
8 8 8
8 8 8
6 6 6
8 8 8
10 10 10
18 18 18
9 9 9
5 5 5
28 28 28
10 10 10
9 9 9
5 5 5
23 23 23
4 4 4
7 7 7
5 5 5
31 31 31
9 9 9
20 20 20
4 4 4
12 12 12
16 16 16
8 8 8
9 9 9
19 19 19
21 21 21
5 5 5
8 8 8
4 4 4
3 3 3
7 7 7
11 11 11
18 18 18
27 27 27
6 6 6
3 3 3
22 22 22
17 17 17
10 10 10
5 5 5
63 63 6

In [20]:
print(f"F1 on ES data before postprocessing      {fine_grained_flausch_by_label(test_gold_spans, test_baseline_spans)['TOTAL']['STRICT']}")
print(f"F1 on ES data with GE2017 postprocessing {fine_grained_flausch_by_label(test_gold_spans, ge2017_rules_test_pred_spans)['TOTAL']['STRICT']}")

F1 on ES data before postprocessing      {'prec': 0.7364043506078055, 'rec': 0.7587343441001978, 'f1': 0.7474025974025974}
F1 on ES data with GE2017 postprocessing {'prec': 0.7312859884836852, 'rec': 0.7534607778510217, 'f1': 0.7422077922077922}


In [21]:
test_comments = test_data

test_comments['predicted_labels'] = None
test_comments['predicted_probs'] = None
test_comments['offset_mapping'] = None
test_comments['text_tokens'] = None

for idx in range(len(test_comments)): #range(15):
    row = test_comments.iloc[idx]
    text = row['comment']
    key = (row['document'], row['comment_id'])

    text_tokens = classifier.tokenizer.tokenize(text)
    test_comments.at[idx, 'text_tokens'] = text_tokens

    device = next(classifier.model.parameters()).device
    inputs = classifier.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, return_offsets_mapping=True)

    offset_mapping = inputs.pop('offset_mapping')
    test_comments.at[idx, 'offset_mapping'] = offset_mapping.cpu().numpy()[0].tolist()
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Vorhersage
    with torch.no_grad():
        outputs = classifier.model(**inputs)

    predicted_labels = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    predicted_probs = torch.nn.functional.softmax(outputs.logits, dim=2)[0].cpu().numpy()
    test_comments.at[idx, 'predicted_labels'] = predicted_labels
    test_comments.at[idx, 'predicted_probs'] = predicted_probs

In [22]:
test_comments

Unnamed: 0,document,comment_id,comment,predicted_labels,predicted_probs,offset_mapping,text_tokens
0,NDY-004,1,Lol i love lochis,"[0, 0, 0, 0, 0, 0, 0, 0]","[[0.99999654, 1.7456429e-07, 1.6115715e-07, 1....","[[0, 0], [0, 1], [1, 3], [4, 5], [6, 10], [11,...","[▁L, ol, ▁i, ▁love, ▁loc, his]"
1,NDY-004,2,ihr singt voll gut :),"[0, 2, 12, 12, 12, 12, 12, 0]","[[0.9999976, 1.1218729e-07, 1.239344e-07, 1.50...","[[0, 0], [0, 3], [4, 8], [8, 9], [10, 14], [15...","[▁ihr, ▁sing, t, ▁voll, ▁gut, ▁:)]"
2,NDY-004,3,Junge fick dich,"[0, 0, 0, 0, 0, 0]","[[0.9999981, 5.8623616e-08, 1.05891374e-07, 1....","[[0, 0], [0, 4], [4, 5], [6, 10], [11, 15], [0...","[▁Jung, e, ▁fick, ▁dich]"
3,NDY-004,4,Ihr seit die besten,"[0, 3, 13, 13, 13, 0]","[[0.99999774, 1.6417343e-07, 1.384722e-07, 1.1...","[[0, 0], [0, 3], [4, 8], [9, 12], [13, 19], [0...","[▁Ihr, ▁seit, ▁die, ▁besten]"
4,NDY-004,5,ihr seit die ALLER besten ich finde euch soooo...,"[0, 3, 13, 13, 13, 13, 13, 3, 13, 13, 13, 13, ...","[[0.99999785, 1.2960982e-07, 1.4320104e-07, 1....","[[0, 0], [0, 3], [4, 8], [9, 12], [13, 17], [1...","[▁ihr, ▁seit, ▁die, ▁ALLE, R, ▁besten, ▁ich, ▁..."
...,...,...,...,...,...,...,...
9224,NDY-203,522,hihi kannst du mich grüßen 💕 👋 😍 Achso wusstes...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 11, 0, 11, 11, ...","[[0.99999774, 1.8107521e-07, 1.0220851e-07, 9....","[[0, 0], [0, 4], [5, 11], [12, 14], [15, 19], ...","[▁hihi, ▁kannst, ▁du, ▁mich, ▁gr, üß, en, ▁, 💕..."
9225,NDY-203,523,#Glocke aktiviert 👑 Ich liebe deine Videos 💍 💎...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 11, 11, 11, 11,...","[[0.9999976, 1.1908668e-07, 8.492378e-08, 6.60...","[[0, 0], [0, 1], [1, 2], [2, 6], [6, 7], [8, 1...","[▁#, G, lock, e, ▁aktiv, iert, ▁, 👑, ▁Ich, ▁li..."
9226,NDY-203,524,Bist die beste ❤ Bitte Grüße mich 💕 ❤ 😘 😍,"[0, 3, 13, 13, 13, 13, 0, 0, 0, 1, 1, 11, 11, ...","[[0.9999974, 2.1362885e-07, 1.2580301e-07, 9.5...","[[0, 0], [0, 3], [3, 4], [5, 8], [9, 14], [15,...","[▁Bis, t, ▁die, ▁beste, ▁❤, ▁Bitte, ▁Grüße, ▁m..."
9227,NDY-203,525,"Hi Bonny ❤️ War letztens auf'm Flughafen , und...","[0, 0, 0, 0, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0.99999523, 6.63842e-07, 2.0147786e-07, 1.16...","[[0, 0], [0, 2], [3, 6], [6, 8], [9, 10], [10,...","[▁Hi, ▁Bon, ny, ▁❤, ️, ▁War, ▁letzten, s, ▁auf..."


In [38]:
test_comments['predicted_spans'] = test_comments.apply(pred_to_spans, axis=1, result_type='expand')
test_comments_spans = pd.DataFrame(test_comments.apply(convert_spans, axis=1).explode().dropna().tolist())

In [40]:
test_comments_spans

Unnamed: 0,document,comment_id,type,start,end
0,NDY-004,2,compliment,0,21
1,NDY-004,4,affection declaration,0,19
2,NDY-004,5,affection declaration,0,25
3,NDY-004,5,affection declaration,26,56
4,NDY-004,5,positive feedback,57,71
...,...,...,...,...,...
5498,NDY-203,526,affection declaration,0,17
5499,NDY-203,526,positive feedback,30,59
5500,NDY-203,526,positive feedback,64,104
5501,NDY-203,526,affection declaration,105,106


In [48]:
test_comments_spans.to_csv("./submissions/task2-predicted.csv", index=False)

In [49]:
!head -n 10 ./submissions/task2-predicted.csv

document,comment_id,type,start,end
NDY-004,2,compliment,0,21
NDY-004,4,affection declaration,0,19
NDY-004,5,affection declaration,0,25
NDY-004,5,affection declaration,26,56
NDY-004,5,positive feedback,57,71
NDY-004,5,affection declaration,72,87
NDY-004,6,affection declaration,0,17
NDY-004,8,implicit,0,46
NDY-004,16,compliment,0,29


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [43]:
!cp './submissions/task2-predicted.csv' './submissions/subtask2_submission2.csv'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [46]:
!head -n 10  './submissions/subtask2_submission1.csv'

document,comment_id,type,start,end
NDY-004,1,affection declaration,0,17
NDY-004,2,compliment,0,21
NDY-004,4,affection declaration,0,19
NDY-004,5,affection declaration,0,25
NDY-004,5,affection declaration,26,56
NDY-004,5,positive feedback,57,71
NDY-004,5,affection declaration,72,87
NDY-004,6,affection declaration,0,17
NDY-004,8,implicit,0,46


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [56]:
test_comments_spans = pd.read_csv("./submissions/task2-predicted.csv")

In [57]:
test_comments_spans

Unnamed: 0,document,comment_id,type,start,end
0,NDY-004,2,compliment,0,21
1,NDY-004,4,affection declaration,0,19
2,NDY-004,5,affection declaration,0,25
3,NDY-004,5,affection declaration,26,56
4,NDY-004,5,positive feedback,57,71
...,...,...,...,...,...
5498,NDY-203,526,affection declaration,0,17
5499,NDY-203,526,positive feedback,30,59
5500,NDY-203,526,positive feedback,64,104
5501,NDY-203,526,affection declaration,105,106


In [58]:
test_comments

Unnamed: 0,document,comment_id,comment,predicted_labels,predicted_probs,offset_mapping,text_tokens,predicted_spans
0,NDY-004,1,Lol i love lochis,"[0, 0, 0, 0, 0, 0, 0, 0]","[[0.99999654, 1.7456429e-07, 1.6115715e-07, 1....","[[0, 0], [0, 1], [1, 3], [4, 5], [6, 10], [11,...","[▁L, ol, ▁i, ▁love, ▁loc, his]",[]
1,NDY-004,2,ihr singt voll gut :),"[0, 2, 12, 12, 12, 12, 12, 0]","[[0.9999976, 1.1218729e-07, 1.239344e-07, 1.50...","[[0, 0], [0, 3], [4, 8], [8, 9], [10, 14], [15...","[▁ihr, ▁sing, t, ▁voll, ▁gut, ▁:)]","[{'type': 'compliment', 'start': 0, 'end': 21,..."
2,NDY-004,3,Junge fick dich,"[0, 0, 0, 0, 0, 0]","[[0.9999981, 5.8623616e-08, 1.05891374e-07, 1....","[[0, 0], [0, 4], [4, 5], [6, 10], [11, 15], [0...","[▁Jung, e, ▁fick, ▁dich]",[]
3,NDY-004,4,Ihr seit die besten,"[0, 3, 13, 13, 13, 0]","[[0.99999774, 1.6417343e-07, 1.384722e-07, 1.1...","[[0, 0], [0, 3], [4, 8], [9, 12], [13, 19], [0...","[▁Ihr, ▁seit, ▁die, ▁besten]","[{'type': 'affection declaration', 'start': 0,..."
4,NDY-004,5,ihr seit die ALLER besten ich finde euch soooo...,"[0, 3, 13, 13, 13, 13, 13, 3, 13, 13, 13, 13, ...","[[0.99999785, 1.2960982e-07, 1.4320104e-07, 1....","[[0, 0], [0, 3], [4, 8], [9, 12], [13, 17], [1...","[▁ihr, ▁seit, ▁die, ▁ALLE, R, ▁besten, ▁ich, ▁...","[{'type': 'affection declaration', 'start': 0,..."
...,...,...,...,...,...,...,...,...
9224,NDY-203,522,hihi kannst du mich grüßen 💕 👋 😍 Achso wusstes...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 11, 0, 11, 11, ...","[[0.99999774, 1.8107521e-07, 1.0220851e-07, 9....","[[0, 0], [0, 4], [5, 11], [12, 14], [15, 19], ...","[▁hihi, ▁kannst, ▁du, ▁mich, ▁gr, üß, en, ▁, 💕...","[{'type': 'positive feedback', 'start': 27, 'e..."
9225,NDY-203,523,#Glocke aktiviert 👑 Ich liebe deine Videos 💍 💎...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 11, 11, 11, 11,...","[[0.9999976, 1.1908668e-07, 8.492378e-08, 6.60...","[[0, 0], [0, 1], [1, 2], [2, 6], [6, 7], [8, 1...","[▁#, G, lock, e, ▁aktiv, iert, ▁, 👑, ▁Ich, ▁li...","[{'type': 'positive feedback', 'start': 20, 'e..."
9226,NDY-203,524,Bist die beste ❤ Bitte Grüße mich 💕 ❤ 😘 😍,"[0, 3, 13, 13, 13, 13, 0, 0, 0, 1, 1, 11, 11, ...","[[0.9999974, 2.1362885e-07, 1.2580301e-07, 9.5...","[[0, 0], [0, 3], [3, 4], [5, 8], [9, 14], [15,...","[▁Bis, t, ▁die, ▁beste, ▁❤, ▁Bitte, ▁Grüße, ▁m...","[{'type': 'affection declaration', 'start': 0,..."
9227,NDY-203,525,"Hi Bonny ❤️ War letztens auf'm Flughafen , und...","[0, 0, 0, 0, 1, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0.99999523, 6.63842e-07, 2.0147786e-07, 1.16...","[[0, 0], [0, 2], [3, 6], [6, 8], [9, 10], [10,...","[▁Hi, ▁Bon, ny, ▁❤, ️, ▁War, ▁letzten, s, ▁auf...","[{'type': 'positive feedback', 'start': 9, 'en..."


In [60]:
test_comments['has_spans'] = test_comments.apply(lambda x: len(x['predicted_spans']) > 0, axis=1)

In [63]:
test_comments['flausch'] = test_comments['has_spans'].map({True: 'yes', False: 'no'})

In [66]:
test_comments[["document","comment_id","flausch"]].to_csv(f'./submissions/task1-predicted.csv', index=False)

In [68]:
!cp './submissions/task1-predicted.csv' './submissions/subtask1_submission2.csv'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [70]:
!head -n 10 './submissions/task1-predicted.csv'

document,comment_id,flausch
NDY-004,1,no
NDY-004,2,yes
NDY-004,3,no
NDY-004,4,yes
NDY-004,5,yes
NDY-004,6,yes
NDY-004,7,no
NDY-004,8,yes
NDY-004,9,no


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
