3 changes: 2 additions & 1 deletion bugbug/model.py
@@ -415,7 +415,8 @@ def train(self, importance_cutoff=0.15, limit=None):
logger.info(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

self.clf.fit(X_train, self.le.transform(y_train))
logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)
if hasattr(self.clf.steps[-1][1], "n_features_in_"):
logger.info("Number of features: %d", self.clf.steps[-1][1].n_features_in_)

logger.info("Model trained")

2 changes: 2 additions & 0 deletions bugbug/models/__init__.py
@@ -17,6 +17,8 @@
"bugtype": "bugbug.models.bugtype.BugTypeModel",
"component": "bugbug.models.component.ComponentModel",
"defect": "bugbug.models.defect.DefectModel",
"defect_finetuning": "bugbug.models.defect.DefectFinetuningModel",
"defect_embedding": "bugbug.models.defect.DefectEmbeddingModel",
"defectenhancementtask": "bugbug.models.defect_enhancement_task.DefectEnhancementTaskModel",
"devdocneeded": "bugbug.models.devdocneeded.DevDocNeededModel",
"fixtime": "bugbug.models.fixtime.FixTimeModel",
98 changes: 98 additions & 0 deletions bugbug/models/defect.py
@@ -7,15 +7,22 @@
import logging
from typing import Any

import torch
import xgboost
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImblearnPipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from skorch import NeuralNetClassifier
from skorch.callbacks import ProgressBar
from skorch.hf import HuggingfacePretrainedTokenizer
from torch import nn

from bugbug import bug_features, bugzilla, feature_cleanup, labels, utils
from bugbug.model import BugModel
from bugbug.nn import DistilBertModule, ExtractEmbeddings, get_training_device
from bugbug.utils import MergeText

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -290,3 +297,94 @@ def overwrite_classes(self, bugs, classes, probabilities):
classes[i] = 0 if not probabilities else [1.0, 0.0]

return classes


class DefectFinetuningModel(DefectModel):
def __init__(self, last_layer_only=True, **kwargs):
super().__init__(**kwargs)

self.sampler = None
self.calculate_importance = False
self.cross_validation_enabled = False

self.extraction_pipeline = Pipeline(
[
(
"bug_extractor",
bug_features.BugExtractor([], [], rollback=True),
),
("extract", MergeText(["title", "comments"])),
]
)

self.clf = Pipeline(
[
(
"tokenizer",
HuggingfacePretrainedTokenizer(
"distilbert-base-uncased", max_length=512
),
),
(
"classifier",
NeuralNetClassifier(
DistilBertModule,
module__name="distilbert-base-uncased",
module__num_labels=2,
module__last_layer_only=last_layer_only,
optimizer=torch.optim.AdamW,
lr=6e-5,
max_epochs=2,
criterion=nn.CrossEntropyLoss,
batch_size=4,
iterator_train__shuffle=True,
device=get_training_device(),
callbacks=[
ProgressBar(),
],
),
),
]
)

def get_feature_names(self):
return []


class DefectEmbeddingModel(DefectModel):
def __init__(self, **kwargs):
super().__init__(**kwargs)

self.sampler = None
self.calculate_importance = False
self.cross_validation_enabled = False

self.extraction_pipeline = Pipeline(
[
(
"bug_extractor",
bug_features.BugExtractor([], [], rollback=True),
),
("extract", MergeText(["title", "comments"])),
]
)

self.clf = Pipeline(
[
(
"tokenizer",
HuggingfacePretrainedTokenizer(
"distilbert-base-uncased", max_length=512
),
),
("extract_embeddings", ExtractEmbeddings("distilbert-base-uncased")),
(
"classifier",
xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()),
),
]
)

def get_feature_names(self):
return []
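
A hedged usage sketch (not part of the patch) of how the two new models could be trained programmatically; the `limit` value is arbitrary and the bugs database is assumed to be available locally. In CI they are driven through scripts/trainer.py instead, as the new tests below show.

```python
# Sketch only: the limit is arbitrary and the bugs DB is assumed to be present locally.
from bugbug.models.defect import DefectEmbeddingModel, DefectFinetuningModel

# Fine-tune DistilBERT; last_layer_only=True freezes the backbone so only the
# classification head is trained.
finetuning_model = DefectFinetuningModel(last_layer_only=True)
finetuning_model.train(limit=2000)

# Alternatively, keep the transformer frozen and train XGBoost on its [CLS] embeddings.
embedding_model = DefectEmbeddingModel()
embedding_model.train(limit=2000)
```
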
52 changes: 52 additions & 0 deletions bugbug/nn.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import torch
from sklearn.base import BaseEstimator, TransformerMixin
from torch import nn
from transformers import AutoModel, AutoModelForSequenceClassification

OPT_MSG_MISSING = (
"Optional dependencies are missing, install them with: pip install bugbug[nn]\n"
)


class ExtractEmbeddings(BaseEstimator, TransformerMixin):
def __init__(self, model_name: str):
self.model = AutoModel.from_pretrained(model_name)

def fit(self, X, y):
return self

def transform(self, X):
with torch.no_grad():
# TODO: support .last_hidden_state.mean(dim=1) as an alternative
return self.model(**X).last_hidden_state[:, 0, :]


def get_training_device() -> str:
return "cuda" if torch.cuda.is_available() else "cpu"


class DistilBertModule(nn.Module):
def __init__(self, name, num_labels, last_layer_only=False):
super().__init__()
self.name = name
self.num_labels = num_labels
self.last_layer_only = last_layer_only

self.reset_weights()

def reset_weights(self):
self.bert = AutoModelForSequenceClassification.from_pretrained(
self.name, num_labels=self.num_labels
)
if self.last_layer_only:
for param in self.bert.distilbert.parameters():
param.requires_grad = False

def forward(self, **kwargs):
pred = self.bert(**kwargs)
return pred.logits
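
For orientation, a minimal sketch of how these helpers behave in isolation. The example texts and the direct use of transformers' AutoTokenizer are illustrative assumptions; in the models above, tokenization goes through skorch's HuggingfacePretrainedTokenizer inside the pipeline.

```python
# Illustrative sketch only: the example texts and direct AutoTokenizer use are assumptions.
import torch
from transformers import AutoTokenizer

from bugbug.nn import DistilBertModule, ExtractEmbeddings

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
batch = tokenizer(
    ["Crash when opening a PDF attachment", "Please add a dark mode option"],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Embedding path: one [CLS] vector per bug, as fed to XGBoost in DefectEmbeddingModel.
embeddings = ExtractEmbeddings("distilbert-base-uncased").transform(batch)
print(embeddings.shape)  # torch.Size([2, 768]) for distilbert-base-uncased

# Fine-tuning path: raw logits, which skorch passes to CrossEntropyLoss.
module = DistilBertModule("distilbert-base-uncased", num_labels=2, last_layer_only=True)
with torch.no_grad():
    logits = module(**batch)
print(logits.shape)  # torch.Size([2, 2])
```
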
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -52,6 +52,9 @@ dependencies = [
"requests==2.33.0",
"requests-html==0.10.0",
"rs-parsepatch==0.4.6",
"skorch==1.3.1",
"torch==2.11.0",
"transformers==5.4.0",
"scikit-learn==1.7.2",
"scipy==1.17.1",
"sendgrid==6.12.5",
@@ -70,7 +73,6 @@ dependencies = [
nlp = [
"spacy==3.8.13",
]
nn = []

[dependency-groups]
test = [
6 changes: 4 additions & 2 deletions scripts/trainer.py
@@ -22,7 +22,9 @@ def go(self, args):

model_name = args.model
model_class = get_model_class(model_name)
parameter_names = set(inspect.signature(model_class.__init__).parameters)
parameter_names = set(inspect.signature(model_class.__init__).parameters) - {
"kwargs"
}
parameters = {
key: value for key, value in vars(args).items() if key in parameter_names
}
@@ -47,7 +49,7 @@ def go(self, args):

logger.info("Training done")

model_directory = f"{model_name}model"
model_directory = model_obj.__class__.__name__.lower()
assert os.path.exists(model_directory)
create_tar_zst(f"{model_directory}.tar.zst")
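
For reference, a hedged sketch of what these two trainer tweaks amount to; it assumes `get_model_class` is importable from `bugbug.models`, as this script already relies on it, and uses `defect_finetuning` purely as an example.

```python
# Hedged sketch, not part of the patch: illustrates the two trainer changes above.
import inspect

from bugbug.models import get_model_class  # assumed import path for the existing helper

model_class = get_model_class("defect_finetuning")

# Dropping "kwargs" prevents the catch-all **kwargs of DefectFinetuningModel.__init__
# from being mistaken for a CLI-configurable hyperparameter.
parameter_names = set(inspect.signature(model_class.__init__).parameters) - {"kwargs"}
print(parameter_names)  # e.g. {'self', 'last_layer_only'}

# The trained-model archive directory now follows the class name (lowercased)
# rather than the registry key plus a "model" suffix.
print(model_class.__name__.lower())  # defectfinetuningmodel
```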

55 changes: 54 additions & 1 deletion tests/test_trainer.py
@@ -3,13 +3,16 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import re

import responses

from bugbug import bugzilla, db
from scripts import trainer


def test_trainer():
# Test xgboost model on TF-IDF
def test_trainer_simple():
# Pretend the DB was already downloaded and no new DB is available.

url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_bugs.latest/artifacts/public/bugs.json"
@@ -29,3 +32,53 @@ def test_trainer():
)

trainer.Trainer().go(trainer.parse_args(["regression"]))


# Test finetuning of transformer model
def test_trainer_finetuning():
responses.add_passthru(re.compile(r"https://.*\.?huggingface.co/\w+"))

# Pretend the DB was already downloaded and no new DB is available.

url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_bugs.latest/artifacts/public/bugs.json"

responses.add(
responses.GET,
f"{url}.version",
status=200,
body=str(db.DATABASES[bugzilla.BUGS_DB]["version"]),
)

responses.add(
responses.HEAD,
f"{url}.zst",
status=200,
headers={"ETag": "etag"},
)

trainer.Trainer().go(trainer.parse_args(["defect_finetuning"]))


# Test xgboost model on the transformer model's embeddings
def test_trainer_embedding():
responses.add_passthru(re.compile(r"https://.*\.?huggingface.co/\w+"))

# Pretend the DB was already downloaded and no new DB is available.

url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_bugs.latest/artifacts/public/bugs.json"

responses.add(
responses.GET,
f"{url}.version",
status=200,
body=str(db.DATABASES[bugzilla.BUGS_DB]["version"]),
)

responses.add(
responses.HEAD,
f"{url}.zst",
status=200,
headers={"ETag": "etag"},
)

trainer.Trainer().go(trainer.parse_args(["defect_embedding"]))