3 changes: 2 additions & 1 deletion bugbug/model.py
@@ -415,7 +415,8 @@ def train(self, importance_cutoff=0.15, limit=None):
logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

self.clf.fit(X_train, self.le.transform(y_train))
logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)
if hasattr(self.clf.steps[-1][1], "n_features_in_"):
logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)

logger.info("Model trained")

2 changes: 2 additions & 0 deletions bugbug/models/__init__.py
@@ -17,6 +17,8 @@
"bugtype": "bugbug.models.bugtype.BugTypeModel",
"component": "bugbug.models.component.ComponentModel",
"defect": "bugbug.models.defect.DefectModel",
"defect_finetuning": "bugbug.models.defect.DefectFinetuningModel",
"defect_embedding": "bugbug.models.defect.DefectEmbeddingModel",
"defectenhancementtask": "bugbug.models.defect_enhancement_task.DefectEnhancementTaskModel",
"devdocneeded": "bugbug.models.devdocneeded.DevDocNeededModel",
"fixtime": "bugbug.models.fixtime.FixTimeModel",
98 changes: 98 additions & 0 deletions bugbug/models/defect.py
@@ -7,15 +7,22 @@
import logging
from typing import Any

import torch
import xgboost
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImblearnPipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from skorch import NeuralNetClassifier
from skorch.callbacks import ProgressBar
from skorch.hf import HuggingfacePretrainedTokenizer
from torch import nn

from bugbug import bug_features, bugzilla, feature_cleanup, labels, utils
from bugbug.model import BugModel
from bugbug.nn import DistilBertModule, ExtractEmbeddings, get_training_device
from bugbug.utils import MergeText

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -290,3 +297,94 @@ def overwrite_classes(self, bugs, classes, probabilities):
classes[i] = 0 if not probabilities else [1.0, 0.0]

return classes


class DefectFinetuningModel(DefectModel):
def __init__(self, last_layer_only=True, **kwargs):
super().__init__(**kwargs)

self.sampler = None
self.calculate_importance = False
self.cross_validation_enabled = False

self.extraction_pipeline = Pipeline(
[
(
"bug_extractor",
bug_features.BugExtractor([], [], rollback=True),
),
("extract", MergeText(["title", "comments"])),
]
)

self.clf = Pipeline(
[
(
"tokenizer",
HuggingfacePretrainedTokenizer(
"distilbert-base-uncased", max_length=512
),
),
(
"classifier",
NeuralNetClassifier(
DistilBertModule,
module__name="distilbert-base-uncased",
module__num_labels=2,
module__last_layer_only=last_layer_only,
optimizer=torch.optim.AdamW,
lr=6e-5,
max_epochs=2,
criterion=nn.CrossEntropyLoss,
batch_size=4,
iterator_train__shuffle=True,
device=get_training_device(),
callbacks=[
ProgressBar(),
],
),
),
]
)

def get_feature_names(self):
return []


class DefectEmbeddingModel(DefectModel):
def __init__(self, **kwargs):
super().__init__(**kwargs)

self.sampler = None
self.calculate_importance = False
self.cross_validation_enabled = False

self.extraction_pipeline = Pipeline(
[
(
"bug_extractor",
bug_features.BugExtractor([], [], rollback=True),
),
("extract", MergeText(["title", "comments"])),
]
)

self.clf = Pipeline(
[
(
"tokenizer",
HuggingfacePretrainedTokenizer(
"distilbert-base-uncased", max_length=512
),
),
("extract_embeddings", ExtractEmbeddings("distilbert-base-uncased")),
(
"classifier",
xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()),
),
]
)

def get_feature_names(self):
return []
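
A hedged usage sketch (not part of the patch) of how the two new models could be trained programmatically; the `limit` value is arbitrary and the bugs database is assumed to be available locally. In CI they are driven through scripts/trainer.py instead, as the new tests below show.

```python
# Sketch only: the limit is arbitrary and the bugs DB is assumed to be present locally.
from bugbug.models.defect import DefectEmbeddingModel, DefectFinetuningModel

# Fine-tune DistilBERT; last_layer_only=True freezes the backbone so only the
# classification head is trained.
finetuning_model = DefectFinetuningModel(last_layer_only=True)
finetuning_model.train(limit=2000)

# Alternatively, keep the transformer frozen and train XGBoost on its [CLS] embeddings.
embedding_model = DefectEmbeddingModel()
embedding_model.train(limit=2000)
```
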
52 changes: 52 additions & 0 deletions bugbug/nn.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import torch
from sklearn.base import BaseEstimator, TransformerMixin
from torch import nn
from transformers import AutoModel, AutoModelForSequenceClassification

OPT_MSG_MISSING = (
"Optional dependencies are missing, install them with: pip install bugbug[nn]\n"
)


class ExtractEmbeddings(BaseEstimator, TransformerMixin):
def __init__(self, model_name: str):
self.model = AutoModel.from_pretrained(model_name)

def fit(self, X, y):
return self

def transform(self, X):
with torch.no_grad():
# TODO: support .last_hidden_state.mean(dim=1) as an alternative
return self.model(**X).last_hidden_state[:, 0, :]


def get_training_device() -> str:
return "cuda" if torch.cuda.is_available() else "cpu"


class DistilBertModule(nn.Module):
def __init__(self, name, num_labels, last_layer_only=False):
super().__init__()
self.name = name
self.num_labels = num_labels
self.last_layer_only = last_layer_only

self.reset_weights()

def reset_weights(self):
self.bert = AutoModelForSequenceClassification.from_pretrained(
self.name, num_labels=self.num_labels
)
if self.last_layer_only:
for param in self.bert.distilbert.parameters():
param.requires_grad = False

def forward(self, **kwargs):
pred = self.bert(**kwargs)
return pred.logits
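
For orientation, a minimal sketch of how these helpers behave in isolation. The example texts and the direct use of transformers' AutoTokenizer are illustrative assumptions; in the models above, tokenization goes through skorch's HuggingfacePretrainedTokenizer inside the pipeline.

```python
# Illustrative sketch only: the example texts and direct AutoTokenizer use are assumptions.
import torch
from transformers import AutoTokenizer

from bugbug.nn import DistilBertModule, ExtractEmbeddings

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
batch = tokenizer(
    ["Crash when opening a PDF attachment", "Please add a dark mode option"],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Embedding path: one [CLS] vector per bug, as fed to XGBoost in DefectEmbeddingModel.
embeddings = ExtractEmbeddings("distilbert-base-uncased").transform(batch)
print(embeddings.shape)  # torch.Size([2, 768]) for distilbert-base-uncased

# Fine-tuning path: raw logits, which skorch passes to CrossEntropyLoss.
module = DistilBertModule("distilbert-base-uncased", num_labels=2, last_layer_only=True)
with torch.no_grad():
    logits = module(**batch)
print(logits.shape)  # torch.Size([2, 2])
```
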
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -52,6 +52,9 @@ dependencies = [
"requests==2.33.0",
"requests-html==0.10.0",
"rs-parsepatch==0.4.6",
"skorch==1.3.1",
"torch==2.11.0",
"transformers==5.4.0",
"scikit-learn==1.7.2",
"scipy==1.17.1",
"sendgrid==6.12.5",
@@ -70,7 +73,6 @@ dependencies = [
nlp = [
"spacy==3.8.13",
]
nn = []

[dependency-groups]
test = [
6 changes: 4 additions & 2 deletions scripts/trainer.py
@@ -22,7 +22,9 @@ def go(self, args):

model_name = args.model
model_class = get_model_class(model_name)
parameter_names = set(inspect.signature(model_class.__init__).parameters)
parameter_names = set(inspect.signature(model_class.__init__).parameters) - {
"kwargs"
}
parameters = {
key: value for key, value in vars(args).items() if key in parameter_names
}
@@ -47,7 +49,7 @@ def go(self, args):

logger.info("Training done")

model_directory = f"{model_name}model"
model_directory = model_obj.__class__.__name__.lower()
assert os.path.exists(model_directory)
create_tar_zst(f"{model_directory}.tar.zst")
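
For reference, a hedged sketch of what these two trainer tweaks amount to; it assumes `get_model_class` is importable from `bugbug.models`, as this script already relies on it, and uses `defect_finetuning` purely as an example.

```python
# Hedged sketch, not part of the patch: illustrates the two trainer changes above.
import inspect

from bugbug.models import get_model_class  # assumed import path for the existing helper

model_class = get_model_class("defect_finetuning")

# Dropping "kwargs" prevents the catch-all **kwargs of DefectFinetuningModel.__init__
# from being mistaken for a CLI-configurable hyperparameter.
parameter_names = set(inspect.signature(model_class.__init__).parameters) - {"kwargs"}
print(parameter_names)  # e.g. {'self', 'last_layer_only'}

# The trained-model archive directory now follows the class name (lowercased)
# rather than the registry key plus a "model" suffix.
print(model_class.__name__.lower())  # defectfinetuningmodel
```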

55 changes: 54 additions & 1 deletion tests/test_trainer.py
@@ -3,13 +3,16 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import re

import responses

from bugbug import bugzilla, db
from scripts import trainer


def test_trainer():
# Test xgboost model on TF-IDF
def test_trainer_simple():
# Pretend the DB was already downloaded and no new DB is available.

url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_bugs.latest/artifacts/public/bugs.json"
@@ -29,3 +32,53 @@ def test_trainer():
)

trainer.Trainer().go(trainer.parse_args(["regression"]))


# Test finetuning of transformer model
def test_trainer_finetuning():
responses.add_passthru(re.compile(r"https://.*\.?huggingface.co/\w+"))

# Pretend the DB was already downloaded and no new DB is available.

url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_bugs.latest/artifacts/public/bugs.json"

responses.add(
responses.GET,
f"{url}.version",
status=200,
body=str(db.DATABASES[bugzilla.BUGS_DB]["version"]),
)

responses.add(
responses.HEAD,
f"{url}.zst",
status=200,
headers={"ETag": "etag"},
)

trainer.Trainer().go(trainer.parse_args(["defect_finetuning"]))


# Test xgboost model on the transformer model's embeddings
def test_trainer_embedding():
responses.add_passthru(re.compile(r"https://.*\.?huggingface.co/\w+"))

# Pretend the DB was already downloaded and no new DB is available.

url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_bugs.latest/artifacts/public/bugs.json"

responses.add(
responses.GET,
f"{url}.version",
status=200,
body=str(db.DATABASES[bugzilla.BUGS_DB]["version"]),
)

responses.add(
responses.HEAD,
f"{url}.zst",
status=200,
headers={"ETag": "etag"},
)

trainer.Trainer().go(trainer.parse_args(["defect_embedding"]))