import csv
import numpy as np
import ROOT
ROOT.RooMsgService.instance().setGlobalKillBelow(5)
%jsroot on
The data has three categories of entries: Derived variables (those that have a physics meaning), principle variables (those that are used to create the derived variables), and event info comprised of event weights, and classification (signal or background).
inputfile = 'input/training.csv'
all = list(csv.reader(open(inputfile,"rb"), delimiter=','))
headers = all[0]
print 'using the derived variables:',headers[1:14]
print ''
print 'or the principle variables:',headers[14:-2]
print ''
print 'with weights and classification at:',headers[-2:]
In this example I choose to look at the MMC mass since it gives the best separation of signal and background.
I also scale the signal by 10 such that it is visible in the plots.
all = list(csv.reader(open(inputfile,"rb"), delimiter=','))
VBF = [variables for variables in all[1:] if (float(variables[6]) > 200 and float(variables[5]) > 2) ]
VBF_Loose = [variables for variables in all[1:] if (float(variables[6]) > 180 and float(variables[5]) > 1.8) ]
VBF_Tight = [variables for variables in all[1:] if (float(variables[6]) > 220 and float(variables[5]) > 2.2) ]
In order to perform statistical tests on this code using the RooStats suite of libraries, first we need to create a workspace. A workspace should contain the statistical model (including any defining parameters such as the normalisations), a pdf of the expected distribution, a parameter of interest (as included in the model), and data to compare this model to.
The first step is to move these distributions from vectors to histograms that can be read by RooFit.
Let's now define a signal region which is just this MMC mass between 100 and 150 GeV
SR = [variables for variables in VBF[1:] if (float(variables[1]) > 100 and float(variables[1]) < 150) ]
SR_Loose = [variables for variables in VBF_Loose[1:] if (float(variables[1]) > 100 and float(variables[1]) < 150) ]
SR_Tight = [variables for variables in VBF_Tight[1:] if (float(variables[1]) > 100 and float(variables[1]) < 150) ]
def get_var(region, region_name=None, h_params=[5,100,150]):
signal = [float(row[1]) for row in region[1:] if row[-1] == 's' ]
sigweights = [float(row[-2])*10 for row in region[1:] if row[-1] == 's' ]
background = [float(row[1]) for row in region[1:] if row[-1] == 'b' ]
bkgweights = [float(row[-2]) for row in region[1:] if row[-1] == 'b' ]
h3 = ROOT.TH1D("{} data".format(region_name),"data histogram",h_params[0],h_params[1],h_params[2])
h1 = ROOT.TH1D("{} signal".format(region_name),"signal histogram",h_params[0],h_params[1],h_params[2])
for i in range(len(signal)):
h1.Fill(signal[i],sigweights[i])
h3.Fill(signal[i],sigweights[i])
h2 = ROOT.TH1D("{} background".format(region_name),"background histogram",h_params[0],h_params[1],h_params[2])
for i in range(len(background)):
h2.Fill(background[i],bkgweights[i])
h3.Fill(background[i],bkgweights[i])
return h3, h1, h2, sum(sigweights), sum(bkgweights)
data_hist, sig_hist, bkg_hist, ns, nb = get_var(SR,"SR", [5,100,150])
data_up_hist, sig_up_hist, bkg_up_hist, nsup, nbup = get_var(SR_Loose,"SR_Loose", [5,100,150])
data_down_hist, sig_down_hist, bkg_down_hist, nsdown, nbdown = get_var(SR_Tight,"SR_Tight", [5,100,150])
print 'n signal {0}(+{1}%,{2}%)'.format(ns,nsup/ns*100-100,nsdown/ns*100-100)
print 'n background {0}(+{1}%,{2}%)'.format(nb,nbup/nb*100-100,nbdown/nb*100-100)
First we set the Parameter of interest, and several constant parameters.
meas = ROOT.RooStats.HistFactory.Measurement("meas", "meas")
meas.SetPOI( "SigXsecOverSM" )
We don't include a specific Luminocity here, but since HistFactory gives it to us for free, we set it constant with a dummy value of 2% (as fairly standard for events with ATLAS)
meas.SetLumi( 1.0 )
meas.SetLumiRelErr( 0.02 )
meas.AddConstantParam("Lumi")
Create a channel and set the measured value of data (no extenal histogram is necessarry for cut-and-count)
Here we use an, 'asimov' dataset which is the nominal signal+background model.
chan = ROOT.RooStats.HistFactory.Channel( "SR" )
chan.SetStatErrorConfig(0.05, "Poisson")
chan.SetData( data_hist )
signal = ROOT.RooStats.HistFactory.Sample( "signal" )
signal.SetNormalizeByTheory( False )
signal.SetHisto( sig_hist )
Add the parmaeter of interest and a systematic and try to make intelligent choice of upper bound
signal.AddNormFactor( "SigXsecOverSM", 1, 0, 3)
Since we have a signal normalisation uncertainty, we can add it too
uncertainty_up = nsup/ns
uncertainty_down = nsdown/ns
signal.AddOverallSys( "signal_norm_uncertainty", uncertainty_down, uncertainty_up )
and a signal shape uncertainty
signal_shape = ROOT.RooStats.HistFactory.HistoSys("signal_shape")
signal_shape.SetHistoHigh( sig_up_hist )
signal_shape.SetHistoLow( sig_down_hist )
signal.AddHistoSys( signal_shape )
Finally, we add this sample to the channel
chan.AddSample( signal )
background = ROOT.RooStats.HistFactory.Sample( "background" )
background.SetNormalizeByTheory( False )
background.SetHisto( bkg_hist )
uncertainty_up = nbup/nb
uncertainty_down = nbdown/nb
background.AddOverallSys( "background_uncertainty", uncertainty_down, uncertainty_up )
background_shape = ROOT.RooStats.HistFactory.HistoSys("background_shape")
background_shape.SetHistoHigh( bkg_up_hist )
background_shape.SetHistoLow( bkg_down_hist )
background.AddHistoSys( background_shape )
chan.AddSample( background )
meas.AddChannel(chan)
hist2workspace = ROOT.RooStats.HistFactory.HistoToWorkspaceFactoryFast(meas)
workspace = hist2workspace.MakeSingleChannelModel(meas, chan)
workspace.SetName('HistFactoryWorkspace')
workspace.writeToFile("output/HistFactoryWorkspace.root")
Here the CR is the region around the Z peak. 80<mmc<100
CR = [variables for variables in VBF[1:] if (float(variables[1]) < 100 and float(variables[1]) > 80) ]
CR_Loose = [variables for variables in VBF_Loose[1:] if (float(variables[1]) < 100 and float(variables[1]) > 80) ]
CR_Tight = [variables for variables in VBF_Tight[1:] if (float(variables[1]) < 100 and float(variables[1]) > 80) ]
h_params = [1,80,100]
data_cr, sig_cr, bkg_cr, ns, nb = get_var(CR,"CR",h_params)
data_up_cr, sig_up_cr, bkg_up_cr, nsup, nbup = get_var(CR_Loose,"CR_Loose",h_params)
data_down_cr, sig_down_cr, bkg_down_cr, nsdown, nbdown = get_var(CR_Tight,"CR_Tight",h_params)
print 'n signal {0}(+{1}%,{2}%)'.format(ns,nsup/ns*100-100,nsdown/ns*100-100)
print 'n background {0}(+{1}%,{2}%)'.format(nb,nbup/nb*100-100,nbdown/nb*100-100)
chan = ROOT.RooStats.HistFactory.Channel( "CR" )
chan.SetStatErrorConfig(0.05, "Poisson")
chan.SetData( data_cr )
signal = ROOT.RooStats.HistFactory.Sample( "signal" )
signal.SetNormalizeByTheory( False )
signal.SetHisto( sig_cr )
uncertainty_up = nsup/ns
uncertainty_down = nsdown/ns
signal.AddOverallSys( "signal_norm_uncertainty", uncertainty_down, uncertainty_up )
signal_shape = ROOT.RooStats.HistFactory.HistoSys("background_shape")
signal_shape.SetHistoHigh( sig_up_cr )
signal_shape.SetHistoLow( sig_down_cr )
background.AddHistoSys( background_shape )
chan.AddSample( signal )
background = ROOT.RooStats.HistFactory.Sample( "background" )
background.SetNormalizeByTheory( False )
background.SetHisto( bkg_cr )
uncertainty_up = nbup/nb
uncertainty_down = nbdown/nb
background.AddOverallSys( "background_uncertainty", uncertainty_down, uncertainty_up )
background_shape = ROOT.RooStats.HistFactory.HistoSys("background_shape")
background_shape.SetHistoHigh( bkg_up_cr )
background_shape.SetHistoLow( bkg_down_cr )
background.AddHistoSys( background_shape )
chan.AddSample( background )
meas.AddChannel(chan)
hist2workspace = ROOT.RooStats.HistFactory.HistoToWorkspaceFactoryFast(meas)
workspace = hist2workspace.MakeCombinedModel(meas)
workspace.SetName('HistFactoryWorkspace')
workspace.writeToFile("output/HistFactoryCombinedWorkspace.root")
infile = ROOT.TFile.Open("output/HistFactoryCombinedWorkspace.root")
w = infile.Get("HistFactoryWorkspace")
mc = w.obj("ModelConfig")
data = w.data("obsData")
x = w.var("obs_x_SR")
pl = ROOT.RooStats.ProfileLikelihoodCalculator(data,mc)
pl.SetConfidenceLevel(0.99)
interval = pl.GetInterval()
plot = ROOT.RooStats.LikelihoodIntervalPlot(interval)
plot.SetNPoints(50)
plot.SetMaximum(5)
c = ROOT.TCanvas()
plot.Draw()
c.Draw()