## Experiment 019-4

SVM mit RBF Kernel, C=5 und Gamma=0.0002

In [2]:
import os
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, make_scorer, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import time
import pickle
import numpy as np
import pandas as pd
import torch
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
from transformers.utils import is_flash_attn_2_available
import wandb
from wandb import AlertLevel


# Process-wide settings: TensorFlow C++ log level, CUDA device ordering and
# visibility (only device index 1 is exposed), and Weights & Biases options.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '2',
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '1',
    'WANDB_PROJECT': 'GermEval2025-Substask1',
    'WANDB_LOG_MODEL': 'false',
})

# Use CUDA when torch reports it as available; otherwise fall back to CPU and say so.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print("CUDA not available, using CPU")

experiment_name = "exp019-4"
testing_mode = False

# Read the training comments and the subtask-1 labels, then join them on the
# (document, comment_id) key.
comments = pd.read_csv("./share-GermEval2025-data/Data/training data/comments.csv")
task1 = pd.read_csv("./share-GermEval2025-data/Data/training data/task1.csv")
comments = comments.merge(task1, on=["document", "comment_id"])

# Keep one row per distinct (comment text, label) pair and renumber the index.
df = (
    comments
    .drop_duplicates(subset=['comment', 'flausch'])
    .reset_index(drop=True)
)

In [None]:
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Pick, for every sequence in the batch, the hidden state of its last attended token.

    If the final column of ``attention_mask`` sums to the batch size, the batch is
    treated as left-padded and the last position is returned for every row.
    Otherwise each row's position is ``attention_mask.sum(dim=1) - 1``.

    Args:
        last_hidden_states: Hidden states indexed as ``[batch, position, ...]``.
        attention_mask: Mask indexed as ``[batch, position]``; its row sums are
            used as the number of attended tokens.

    Returns:
        One hidden-state entry per batch row.
    """
    rows = attention_mask.shape[0]
    if attention_mask[:, -1].sum() == rows:
        # Every sequence ends on an attended token: the last position is the answer.
        return last_hidden_states[:, -1]

    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]

class Qwen3Embedder:
    """Instruction-prefixed text embedder built on a Hugging Face ``AutoModel``.

    The model is loaded in float16, put in eval mode and moved to ``self.device``.
    The tokenizer is configured with left padding. Each text is wrapped in an
    ``Instruct: ...\\nQuery:...`` prompt before encoding, and the embedding of a
    text is the hidden state of its last token.
    """

    def __init__(self, model_name='Qwen/Qwen3-Embedding-8B', instruction=None, max_length=1024, device=None):
        """Load model and tokenizer.

        Args:
            model_name: Hugging Face model identifier passed to ``from_pretrained``.
            instruction: Task instruction prepended to every text. When ``None``,
                a default flausch / non-flausch classification instruction is used.
            max_length: Maximum token length; longer inputs are truncated.
            device: Device for model and inputs (string or ``torch.device``).
                When ``None``, CUDA is used if available, otherwise the CPU.
        """
        if instruction is None:
            instruction = 'Classify a given comment as either flausch (a positive, supportive expression) or non-flausch.'
        self.instruction = instruction
        self.max_length = max_length

        # Resolve the device once and use it for both the model and the inputs,
        # so the embedder also works on machines without CUDA.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

        load_kwargs = {"trust_remote_code": True, "torch_dtype": torch.float16}
        # FlashAttention 2 is only requested when the model will run on CUDA.
        if self.device.type == 'cuda' and is_flash_attn_2_available():
            load_kwargs["attn_implementation"] = "flash_attention_2"

        self.model = AutoModel.from_pretrained(model_name, **load_kwargs)
        self.model = self.model.to(self.device)
        self.model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side='left')

    def get_detailed_instruct(self, query: str) -> str:
        """Return the prompt for ``query``: the instruction line followed by the query."""
        return f'Instruct: {self.instruction}\nQuery:{query}'

    def encode_batch(self, texts, batch_size=32):
        """Encode texts in batches to handle memory efficiently.

        Args:
            texts: Sequence of strings (must support ``len`` and slicing).
            batch_size: Number of texts per forward pass.

        Returns:
            A 2-D NumPy array with one row per input text, stacked in input order.
            No dtype cast or L2 normalization is applied to the model output.

        Raises:
            ValueError: If ``texts`` is empty.
        """
        if len(texts) == 0:
            # np.vstack would raise a less helpful ValueError on an empty list.
            raise ValueError("texts must contain at least one item")

        all_embeddings = []

        for start in range(0, len(texts), batch_size):
            batch_texts = [self.get_detailed_instruct(comment) for comment in texts[start:start + batch_size]]

            # Tokenize the batch (padded to the longest item, truncated to max_length).
            inputs = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt'
            ).to(self.device)

            # Forward pass without gradient tracking.
            with torch.no_grad():
                outputs = self.model(**inputs)
                # Last-token pooling: take the hidden state of each sequence's final token.
                embeddings = last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])

            all_embeddings.append(embeddings.cpu().numpy())

        # NOTE: embeddings are returned without L2 normalization; whether to
        # normalize was left as an open question by the original author.
        return np.vstack(all_embeddings)

# Build the embedder with the task instruction used for this experiment.
print("Loading Qwen3 Embeddings v3...")
embedder = Qwen3Embedder(
    instruction='Classify a given comment as either flausch (a positive, supportive expression) or non-flausch'
)

# Inputs are the raw comment texts; labels are encoded as yes -> 1, no -> 0.
X = df["comment"]
y = df["flausch"].map({"yes": 1, "no": 0})

# Training embeddings are cached on disk per experiment name; they are only
# computed (and then saved) when the cache file is missing.
embeddings_file = 'Qwen3-Embedding-8B-' + experiment_name + '.npy'
if not os.path.exists(embeddings_file):
    print("Embeddings not found, generating new embeddings...")
    X_embeddings = embedder.encode_batch(X.tolist(), batch_size=64)
    print(f"Generated embeddings with shape: {X_embeddings.shape}")
    np.save(embeddings_file, X_embeddings)
else:
    print(f"Loading existing embeddings from {embeddings_file}")
    X_embeddings = np.load(embeddings_file)

# Standardize the embedding features, then classify with an SVC (C=5, gamma=0.0002).
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(random_state=42, C=5, gamma=0.0002, cache_size=2000)),
])

# F1 scorer for the positive class (label 1).
f1_pos_scorer = make_scorer(f1_score, pos_label=1, average='binary')

# Fit on the full training set; no hold-out split is made in this cell.
X_train, y_train = X_embeddings, y
pipe.fit(X_train, y_train)

In [3]:
# Load the test comments; the bare name on the last line displays the frame.
test_data: pd.DataFrame = pd.read_csv("./share-GermEval2025-data/Data/test data/comments.csv")
test_data

Unnamed: 0,document,comment_id,comment
0,NDY-004,1,Lol i love lochis
1,NDY-004,2,ihr singt voll gut :)
2,NDY-004,3,Junge fick dich
3,NDY-004,4,Ihr seit die besten
4,NDY-004,5,ihr seit die ALLER besten ich finde euch soooo...
...,...,...,...
9224,NDY-203,522,hihi kannst du mich grüßen 💕 👋 😍 Achso wusstes...
9225,NDY-203,523,#Glocke aktiviert 👑 Ich liebe deine Videos 💍 💎...
9226,NDY-203,524,Bist die beste ❤ Bitte Grüße mich 💕 ❤ 😘 😍
9227,NDY-203,525,"Hi Bonny ❤️ War letztens auf'm Flughafen , und..."


In [6]:
# Embed the test comments with `embedder`; unlike the training embeddings, these are not cached to disk.
X_test_data = embedder.encode_batch(test_data['comment'].tolist(), batch_size=64)

In [7]:
# Predict one label per test comment with the fitted scaler + SVM pipeline (1 = yes, 0 = no).
y_prediction = pipe.predict(X_test_data)

In [11]:
# Attach the predictions to the test frame as 'yes'/'no' strings (1 -> 'yes', 0 -> 'no'),
# then display the frame.
test_data['flausch'] = pd.Series(y_prediction, index=test_data.index).map({1: 'yes', 0: 'no'})
test_data

Unnamed: 0,document,comment_id,comment,flausch
0,NDY-004,1,Lol i love lochis,no
1,NDY-004,2,ihr singt voll gut :),yes
2,NDY-004,3,Junge fick dich,no
3,NDY-004,4,Ihr seit die besten,yes
4,NDY-004,5,ihr seit die ALLER besten ich finde euch soooo...,yes
...,...,...,...,...
9224,NDY-203,522,hihi kannst du mich grüßen 💕 👋 😍 Achso wusstes...,no
9225,NDY-203,523,#Glocke aktiviert 👑 Ich liebe deine Videos 💍 💎...,yes
9226,NDY-203,524,Bist die beste ❤ Bitte Grüße mich 💕 ❤ 😘 😍,yes
9227,NDY-203,525,"Hi Bonny ❤️ War letztens auf'm Flughafen , und...",yes


In [12]:
# Preview the three columns that make up the submission: document, comment_id, flausch.
test_data[['document', 'comment_id', 'flausch']]

Unnamed: 0,document,comment_id,flausch
0,NDY-004,1,no
1,NDY-004,2,yes
2,NDY-004,3,no
3,NDY-004,4,yes
4,NDY-004,5,yes
...,...,...,...
9224,NDY-203,522,no
9225,NDY-203,523,yes
9226,NDY-203,524,yes
9227,NDY-203,525,yes


In [16]:
# Write the submission file (document, comment_id, flausch) without the index column.
# Create the output directory first so the cell also works when ./submissions does not exist yet.
os.makedirs('./submissions', exist_ok=True)
test_data[['document', 'comment_id', 'flausch']].to_csv('./submissions/subtask1_submission1.csv', index=False)

In [19]:
# Sanity check: print the first 10 lines (header + 9 rows) of the written submission file.
!head -n 10 './submissions/subtask1_submission1.csv'

document,comment_id,flausch
NDY-004,1,no
NDY-004,2,yes
NDY-004,3,no
NDY-004,4,yes
NDY-004,5,yes
NDY-004,6,yes
NDY-004,7,no
NDY-004,8,no
NDY-004,9,no


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Copy the submission file to the name 'task1-predicted.csv' in the same directory.
!cp './submissions/subtask1_submission1.csv' './submissions/task1-predicted.csv'

 Score für Subtask 1:

 → 0.88