import os
import sys
from optparse import OptionParser
import glob
import fnmatch
import re
import ROOT
import json
import time
from datetime import timedelta
import pandas as pd
import numpy as np

#numpy_dic = {}
# Lookup table from MC dataset ID (integer key) to the name of the physics-process
# category it belongs to. getCategory() consults this table for every non-data sample;
# IDs that are missing here are reported there and get the category string "None".
category = {700587:"Diboson",700588:"Diboson",700589:"Diboson",700590:"Diboson",700591:"Diboson",700592:"Diboson",700593:"Diboson",700594:"Diboson",700600:"Diboson",700601:"Diboson",700602:"Diboson",700603:"Diboson",700604:"Diboson",700605:"Diboson",700488:"Diboson",700489:"Diboson",700490:"Diboson",700491:"Diboson",700492:"Diboson",700493:"Diboson",700494:"Diboson",700495:"Diboson",700496:"Diboson",700199:"Vgamgam",700200:"Vgamgam",700201:"Vgamgam",700195:"Vgamgam",700196:"Vgamgam",700197:"Vgamgam",700198:"Vgamgam",364242:"Triboson",364243:"Triboson",364244:"Triboson",364245:"Triboson",364246:"Triboson",364247:"Triboson",364248:"Triboson",364249:"Triboson",364336:"Triboson",364337:"Triboson",364338:"Triboson",364339:"Triboson",700402:"Vgamma",700403:"Vgamma",700404:"Vgamma",700398:"Vgamma",700399:"Vgamma",700400:"Vgamma",700401:"Vgamma",700709:"Vgamma",700710:"Vgamma",700507:"Vgamma",700352:"Vgamma",700353:"Vgamma",700338:"Wjets",700339:"Wjets",700340:"Wjets",700341:"Wjets",700342:"Wjets",700343:"Wjets",700344:"Wjets",700345:"Wjets",700346:"Wjets",700347:"Wjets",700348:"Wjets",700349:"Wjets",700362:"Wjets",700363:"Wjets",700364:"Wjets",700843:"Wjets",700320:"Zjets",700321:"Zjets",700322:"Zjets",700323:"Zjets",700324:"Zjets",700325:"Zjets",700335:"Zjets",700336:"Zjets",700337:"Zjets",700358:"Zjets",700359:"Zjets",700360:"Zjets",700361:"Zjets",700849:"Zjets",700855:"Zjets",700467:"lowmassDY",700468:"lowmassDY",700469:"lowmassDY",700470:"lowmassDY",700471:"lowmassDY",700472:"lowmassDY",700792:"lowmassDY",700793:"lowmassDY",700794:"lowmassDY",700901:"lowmassDY",700902:"lowmassDY",700903:"lowmassDY",304014:"TopX",410081:"TopX",410155:"TopX",410156:"TopX",410157:"TopX",410218:"TopX",410219:"TopX",410220:"TopX",410408:"TopX",410389:"TopX",410560:"TopX",412043:"TopX",412044:"TopX",410082:"TopX",410084:"TopX",410087:"TopX",410089:"TopX",601352:"singleTop",601355:"singleTop",
                410470:"ttbar",
                410471:"ttbar",
                410472:"ttbar",
                410644:"singleTop",410645:"singleTop",410658:"singleTop",410659:"singleTop",341456:"Higgs",341458:"Higgs",341460:"Higgs",343981:"Higgs",345056:"Higgs",345058:"Higgs",345060:"Higgs",345061:"Higgs",345066:"Higgs",345097:"Higgs",345098:"Higgs",345103:"Higgs",345104:"Higgs",345105:"Higgs",345106:"Higgs",345112:"Higgs",345114:"Higgs",345120:"Higgs",345121:"Higgs",345122:"Higgs",345123:"Higgs",345124:"Higgs",345125:"Higgs",345211:"Higgs",345212:"Higgs",345213:"Higgs",345214:"Higgs",345215:"Higgs",345216:"Higgs",345217:"Higgs",345218:"Higgs",345219:"Higgs",345316:"Higgs",345317:"Higgs",345318:"Higgs",345319:"Higgs",345320:"Higgs",345321:"Higgs",345322:"Higgs",345324:"Higgs",345325:"Higgs",345433:"Higgs",345445:"Higgs",345596:"Higgs",345833:"Higgs",345834:"Higgs",345876:"Higgs",345877:"Higgs",345878:"Higgs",345948:"Higgs",345949:"Higgs",345961:"Higgs",345963:"Higgs",345964:"Higgs",345965:"Higgs",346188:"Higgs",346190:"Higgs",346191:"Higgs",346192:"Higgs",346193:"Higgs",346194:"Higgs",346195:"Higgs",346198:"Higgs",346214:"Higgs",346228:"Higgs",346229:"Higgs",346230:"Higgs",346310:"Higgs",346311:"Higgs",346312:"Higgs",346317:"Higgs",346340:"Higgs",346341:"Higgs",346342:"Higgs",346343:"Higgs",346344:"Higgs",346345:"Higgs",346414:"Higgs",346486:"Higgs",346511:"Higgs",346524:"Higgs",346525:"Higgs",346588:"Higgs",346605:"Higgs",346606:"Higgs",346607:"Higgs",346632:"Higgs",346633:"Higgs",346634:"Higgs",346645:"Higgs",346646:"Higgs",346647:"Higgs",346919:"Higgs",346923:"Higgs",346927:"Higgs",450576:"Higgs"}

def locate(pattern, root_path):
    """Yield the path of every directory below root_path whose name matches pattern.

    The tree is walked from the absolute form of ``root_path`` and directory
    names are compared against ``pattern`` with fnmatch (shell-style wildcards).
    The arguments are printed once, when the generator is first advanced.
    """
    print(pattern,root_path)
    top = os.path.abspath(root_path)
    for current, subdirs, _files in os.walk(top):
        matching = fnmatch.filter(subdirs, pattern)
        yield from (os.path.join(current, name) for name in matching)

def getListOfSamples():
    """Return the second comma-separated field of every data row in ./input/*.csv.

    The first line of each file is treated as a header and skipped. Blank lines
    (for example a trailing empty line at the end of a file) are ignored instead
    of raising IndexError. Files are read in sorted order so that the returned
    list is reproducible between runs.

    A non-blank row without a second field still raises IndexError, so a
    malformed file is not silently accepted.
    """
    samples = []
    for csv_path in sorted(glob.glob("./input/*.csv")):
        with open(csv_path, 'r') as fp:
            next(fp, None)  # skip the header row (no-op for an empty file)
            for line in fp:
                row = line.rstrip()
                if not row:
                    continue
                samples.append(row.split(",")[1])
    return samples

# First and last run number (both inclusive) of every data-taking period, per data year.
# The spans are stored as ``range`` objects: integer membership tests on a range are
# computed arithmetically, so nothing is materialised and nothing is rebuilt per call.
# NOTE(review): "PeriodI" is capitalised differently from the other period names. It is
# kept unchanged because the name is returned to the caller, which uses it to build the
# data category name - confirm before normalising it.
_DATA_PERIODS = {
    "data16": {
        "periodA": range(297730, 300279 + 1),
        "periodB": range(300345, 300908 + 1),
        "periodC": range(301912, 302393 + 1),
        "periodD": range(302737, 303560 + 1),
        "periodE": range(303638, 303892 + 1),
        "periodF": range(303943, 304494 + 1),
        "periodG": range(305380, 306451 + 1),
        "PeriodI": range(307126, 308084 + 1),
        "periodK": range(309375, 309759 + 1),
        "periodL": range(310015, 311481 + 1),
    },
    "data15": {
        "periodD": range(276262, 276954 + 1),
        "periodE": range(278880, 279928 + 1),
        "periodF": range(279932, 280368 + 1),
        "periodG": range(280423, 281075 + 1),
        "periodH": range(281317, 281411 + 1),
        "periodJ": range(282625, 284484 + 1),
    },
}


def getDataPeriods(datayr,rnum):
    """Assign data to the right data taking period based on run number.

    Parameters:
        datayr: data year key, "data15" or "data16".
        rnum:   run number (int).

    Returns:
        The name of the period whose run range contains ``rnum``, or -1 (after
        printing a message) when the year is unknown or the run number lies in
        none of the listed ranges, e.g. in a gap between two periods.
    """
    periods = _DATA_PERIODS.get(datayr)
    if periods is None:
        print("Data year %s not in dictionary"%datayr)
        return -1
    for period, runs in periods.items():
        if rnum in runs:
            return period

    print("Did not find period for run %i"%rnum)
    return -1

def getCategory(did,isData=False):
    """Return the category name of a sample.

    Parameters:
        did:    run number (data) or dataset ID (MC), as an int.
        isData: True when ``did`` is a data run number.

    Returns:
        For data: "<year>_<period>", e.g. "data15_periodD", with the period taken
        from getDataPeriods(). For MC: the entry of the module-level ``category``
        table. In every failure case an error is printed and the string "None"
        is returned.

    getDataPeriods() returns -1 for a run number that lies inside the overall
    span of a year but in none of its periods. That value is checked here:
    concatenating it to the year prefix would raise TypeError.
    """
    if isData:
        if 276262 <= did <= 284484:
            year = "data15"
        elif 297730 <= did <= 311481:
            year = "data16"
        else:
            year = None

        if year is not None:
            period = getDataPeriods(year,did)
            if period != -1:
                return year+"_"+period

        print("ERROR \t Could not find category for %i. Returning None" %did)
        return "None"

    if did in category:
        return category[did]

    print("ERROR \t Could not find category for %i. Returning None" %did)
    return "None"

    
def _flagCountFilter(df, flag, flag_expr, counter, cut_expr, label):
    """Define a per-object flag column, count the flagged objects and filter on that count."""
    df = df.Define(flag, flag_expr)
    df = df.Define(counter, "ROOT::VecOps::Sum(%s,0.)" % flag)
    return df.Filter(cut_expr, label)


def applySkim(skim,df):
    """Defines the skims to be used. Here one can add any new skim one would like.

    Parameters:
        skim: name of the skim. Known names are "exactly4lep", "exactly2bjets",
              "2muons", "GamGam", "3J1LMET30" and "2J2LMET30".
        df:   data frame offering ``Define(name, expr)`` and ``Filter(expr, label)``
              (an RDataFrame node in this script).

    Returns:
        The data frame with the skim's helper columns and filters attached. Any
        other skim name attaches a single always-true filter labelled "nofilter".

    The skims that count objects add helper columns with Define, so such a skim
    must not be applied twice to the same data frame.
    """
    if skim == "exactly4lep":
        return df.Filter("lep_n == 4","Exactly 4 leptons")

    if skim == "exactly2bjets":
        # The cut is an equality, so the label says "Exactly" (it used to read "At least").
        return _flagCountFilter(df, "is_bjet", "jet_btag_quantile == 2",
                                "n_bjet", "n_bjet == 2", "Exactly two b-jets")

    if skim == "2muons":
        return _flagCountFilter(df, "is_muon", "abs(lep_type) == 13",
                                "n_muon", "n_muon >= 2", "At least two muons")

    if skim == "GamGam":
        return _flagCountFilter(df, "sig_ph", "photon_pt > 20",
                                "n_sig_ph", "n_sig_ph >= 2", "At least two photons")

    if skim == "3J1LMET30":
        df_flat = _flagCountFilter(df, "sig_lep", "lep_isTightID",
                                   "n_sig_lep", "n_sig_lep >= 1", "At least one lepton")
        # NOTE(review): the skim name suggests three jets, the cut requires more than three - confirm.
        df_flat = df_flat.Filter("jet_n > 3","More than three jets")
        return df_flat.Filter("met > 30","MET > 30 GeV")

    if skim == "2J2LMET30":
        df_flat = _flagCountFilter(df, "sig_lep", "lep_isTightID",
                                   "n_sig_lep", "n_sig_lep >= 2", "At least two leptons")
        df_flat = df_flat.Filter("jet_n >= 2","At least two jets")
        return df_flat.Filter("met > 30","MET > 30 GeV")

    # Pass-through: a literal boolean keeps the filter expression unambiguously of type bool.
    return df.Filter("true","nofilter")
    
def getHistograms(tfil,did):
    """Gets the histogram holding the cut book keeper information.

    Parameters:
        tfil: open ROOT file whose top-level keys are scanned.
        did:  dataset ID (int) that must appear in the histogram name.

    Returns:
        The object read from the first key whose class inherits from TH1 and
        whose name starts with ``CutBookkeeper_<did>_<six digits>_NOSYS``, or
        None when no key qualifies.

    Every key is visited exactly once. The previous hand-written iterator loop
    did not advance past a histogram key whose name failed the pattern and
    therefore never terminated on such a file.
    """
    # Raw string: "\d" is a regex escape, not a Python string escape.
    name_pattern = re.compile(r"CutBookkeeper_%i_\d{6}_NOSYS" % did)
    for key in tfil.GetListOfKeys():
        cl = ROOT.gROOT.GetClass(key.GetClassName())
        # A class unknown to ROOT yields a null class pointer; treat it as "not a histogram".
        if not cl or not cl.InheritsFrom("TH1"):
            continue
        if name_pattern.match(key.GetName()):
            return key.ReadObj()
    return None

def makeCSV(df,outname):
    """Makes CSV file.

    Parameters:
        df:      data frame to export (an RDataFrame node in this script). It is
                 expected to have the skim applied already, as both call sites in
                 this file do. The skim is deliberately not applied again here:
                 skims that add helper columns with Define cannot be attached to
                 the same data frame a second time.
        outname: path of the CSV file to write.

    Column selection:
        * every column with "lep_" in its name and a ROOT::VecOps::RVec type is
          expanded into four scalar columns lep1_..lep4_ holding elements 0..3;
        * every column whose name contains one of the tags in ``scalar_tags`` is
          exported unchanged. This now also covers non-vector "lep_" columns such
          as lep_n, which previously were skipped before this check was reached.
    """
    scalar_tags = ("trigE", "trigM", "ScaleFactor_", "mcWeight", "corrected_xsec",
                   "sum_of_weights", "lep_n", "channelNumber", "runNumber", "eventNumber")

    df_flat = df
    numpy_col = []
    # The column list is taken before any lepN_ column is defined, so new columns are not revisited.
    for coln in df.GetColumnNames():
        col = str(coln)
        if "lep_" in col and "ROOT::VecOps::RVec" in df_flat.GetColumnType(coln):
            # NOTE(review): elements 0..3 are read unconditionally; presumably events with
            # fewer than four leptons then index past the end of the vector - confirm and
            # consider a bounds-checked access with a fallback value.
            for i in range(4):
                newcol = col.replace("lep_","lep%i_"%(i+1))
                df_flat = df_flat.Define(newcol,f'{col}[{i}]')
                if newcol not in numpy_col:
                    numpy_col.append(newcol)
        if any(tag in col for tag in scalar_tags) and col not in numpy_col:
            numpy_col.append(col)

    pandas_df = pd.DataFrame(data=df_flat.AsNumpy(numpy_col))
    pandas_df.to_csv(outname)
    
def createNtup(cat,ntdir,hfdir,uname,did,skim="all",isData=False,corrected_metadata=None,doCSV=False):
    """Creates the ROOT NTuple file and adds new branches containing the scaling information.

    All files matching ``{ntdir}/user.{uname}.*.root`` are candidates. The accepted
    files are chained into one RDataFrame on the "analysis" tree, the skim is
    applied, and the constant columns num_events, sum_of_weights,
    sum_of_weights_squared, corrected_xsec and DatasetNumber are added.

    MC (isData False): a file is accepted only if its companion ``*_hist.root`` file
    exists in ``hfdir`` and contains the bookkeeping histogram found by
    getHistograms(); bins 1, 2 and 3 of that histogram are summed over the files
    into num_events, sum_of_weights and sum_of_weights_squared. The output is
    ``{outdir}/MC/mc_{did}.{physics_short}.{skim}.root``, or the matching ``.csv``
    (written only if it does not exist yet) when ``doCSV`` is true.

    Data (isData True): every file is accepted, the constants are 1.0 and the
    output is ``{outdir}/{cat}.root``.

    ``outdir`` is the module-level variable set in the ``__main__`` section.

    Parameters:
        cat:    category name of the sample.
        ntdir:  directory holding the ntuple files.
        hfdir:  directory holding the histogram files.
        uname:  user name that forms part of the file names.
        did:    dataset ID / run number (int).
        skim:   skim name handed to applySkim().
        isData: True for data, False for MC.
        corrected_metadata: for MC, a dict with 'physics_short', 'crossSection_pb',
                'genFiltEff' and 'kFactor'. Not used for data.
        doCSV:  write a CSV file instead of a ROOT file (MC only).

    Returns:
        (all_files, metadata): the accepted ntuple files and a dict with the keys
        num_events, sum_of_weights, sum_of_weights_squared, category,
        corrected_xsec and dsid for this sample, i.e. the same layout that
        getMetadata() returns.
    """
    if corrected_metadata is None:
        corrected_metadata = {}

    ntupfiles = sorted(glob.glob(f'{ntdir}/user.{uname}.*.root'))
    # A set makes the per-file lookup below independent of the number of histogram files.
    histfiles = set(glob.glob(f'{hfdir}/user.{uname}.*.root'))

    if isData:
        physdescr = cat
        corrected_xsec = 1.0
    else:
        physdescr = corrected_metadata['physics_short'].strip()
        corrected_xsec = (corrected_metadata['crossSection_pb']
                          * corrected_metadata['genFiltEff']
                          * corrected_metadata['kFactor'])

    all_files = []
    n_matched = 0
    nev = 0.0
    sow = 0.0
    sow2 = 0.0
    for inf, nf in enumerate(ntupfiles, start=1):
        print("Adding file %i/%i for %s"%(inf,len(ntupfiles),physdescr))
        print(nf)
        if isData:
            all_files.append(nf)
            continue

        ntup_file = nf.split("/")[-1]
        look_for_hist_file = os.path.join(hfdir,ntup_file.replace("_ntup.root","_hist.root"))
        if look_for_hist_file not in histfiles:
            print("ERROR \t Did not find a matching hist file for %s"%(ntup_file))
            continue

        tfile = ROOT.TFile(look_for_hist_file)
        try:
            hist = getHistograms(tfile,did)
            if not hist:
                print("WARNING \t No bookkeeping histogram for %i in %s, skipping %s"%(did,look_for_hist_file,ntup_file))
                continue
            # Copy the numbers out before the file is closed.
            n_events, sum_w, sum_w2 = (hist.GetBinContent(ibin) for ibin in (1,2,3))
        finally:
            tfile.Close()

        nev += n_events
        sow += sum_w
        sow2 += sum_w2
        print("%i :: Adding %f, now %f"%(did,n_events,nev))
        all_files.append(nf)
        n_matched += 1

    print("Added %i files for %s with %f events and sow %f"%(n_matched,physdescr,nev,sow))

    if isData:
        metadata = {"num_events":1.0,"sum_of_weights":1.0,"sum_of_weights_squared":1.0,"category":cat,"corrected_xsec":1.0,"dsid":did}
    else:
        metadata = {"num_events":nev,"sum_of_weights":sow,"sum_of_weights_squared":sow2,"category":cat,"corrected_xsec":corrected_xsec,"dsid":did}

    if not all_files:
        # There is nothing to build a data frame from.
        print("ERROR \t No usable input files for %s, nothing written"%physdescr)
        return all_files, metadata

    df = ROOT.RDataFrame("analysis",all_files)
    df = applySkim(skim,df)

    if isData:
        df = df.Define("num_events","1.0")
        df = df.Define("sum_of_weights","1.0")
        df = df.Define("sum_of_weights_squared","1.0")
        df = df.Define("corrected_xsec","1.0")
        df = df.Define("DatasetNumber","%i"%did)
        df.Snapshot("analysis",f'{outdir}/{cat}.root')
    else:
        # repr() of a float keeps full precision and always contains a decimal point or an
        # exponent. "%f" keeps six decimals only, which turns a small cross-section times
        # branching ratio into 0.000000.
        df = df.Define("num_events",repr(float(nev)))
        df = df.Define("sum_of_weights",repr(float(sow)))
        df = df.Define("sum_of_weights_squared",repr(float(sow2)))
        df = df.Define("DatasetNumber","%i"%did)
        df = df.Define("corrected_xsec",repr(float(corrected_xsec)))
        outname = f'{outdir}/MC/mc_{did}.{physdescr}.{skim}.root'
        if not doCSV:
            df.Snapshot("analysis",outname)
            print("Created file %s"%outname)
        else:
            print(cat,did)
            outname_csv = outname.replace(".root",".csv")
            if not os.path.isfile(outname_csv):
                makeCSV(df,outname_csv)

    return all_files, metadata
    

def getMetadata(cat,ntdir,hfdir,uname,did,isData=False,corrected_metadata=None):
    """Collect the input files and the per-sample metadata of one sample.

    Combines the metadata read from the csv file from the OpenData web page
    (``corrected_metadata``) with the information stored in the histograms
    from the ntuple production.

    MC (isData False): an ntuple file matching ``{ntdir}/user.{uname}.*.root`` is
    accepted only if its companion ``*_hist.root`` file exists in ``hfdir`` and
    contains the bookkeeping histogram found by getHistograms(); bins 1, 2 and 3
    of that histogram are added to num_events, sum_of_weights and
    sum_of_weights_squared. Data (isData True): every file is accepted and the
    three sums and the cross-section are fixed to 1.0.

    Parameters:
        cat:    category name of the sample.
        ntdir:  directory holding the ntuple files.
        hfdir:  directory holding the histogram files.
        uname:  user name that forms part of the file names.
        did:    dataset ID / run number (int).
        isData: True for data, False for MC.
        corrected_metadata: for MC, a dict with 'crossSection_pb', 'genFiltEff'
                and 'kFactor'. Not used for data.

    Returns:
        (all_files, metadata): the accepted ntuple files and a dict with the keys
        num_events, sum_of_weights, sum_of_weights_squared, category,
        corrected_xsec and dsid.
    """
    if isData:
        sample_meta = {"num_events":1.0,"sum_of_weights":1.0,"sum_of_weights_squared":1.0,"category":cat,"corrected_xsec":1.0,"dsid":did}
    else:
        if corrected_metadata is None:
            corrected_metadata = {}
        sample_meta = {"num_events":0.0,"sum_of_weights":0.0,"sum_of_weights_squared":0.0,"category":cat,"corrected_xsec":corrected_metadata['crossSection_pb']*corrected_metadata['genFiltEff']*corrected_metadata['kFactor'],"dsid":did}

    ntupfiles = sorted(glob.glob(f'{ntdir}/user.{uname}.*.root'))
    if isData:
        all_files = list(ntupfiles)
        print("Added %i files for %s"%(len(all_files),cat))
        return all_files, sample_meta

    # A set makes the per-file lookup below independent of the number of histogram files.
    histfiles = set(glob.glob(f'{hfdir}/user.{uname}.*.root'))
    all_files = []
    for nf in ntupfiles:
        ntup_file = nf.split("/")[-1]
        look_for_hist_file = os.path.join(hfdir,ntup_file.replace("_ntup.root","_hist.root"))
        if look_for_hist_file not in histfiles:
            print("ERROR \t Did not find a matching hist file for %s"%(ntup_file))
            continue

        tfile = ROOT.TFile(look_for_hist_file)
        try:
            hist = getHistograms(tfile,did)
            if not hist:
                # Previously such a file was dropped without any message.
                print("WARNING \t No bookkeeping histogram for %i in %s, skipping %s"%(did,look_for_hist_file,ntup_file))
                continue
            # Copy the numbers out before the file is closed.
            n_events, sum_w, sum_w2 = (hist.GetBinContent(ibin) for ibin in (1,2,3))
        finally:
            tfile.Close()

        sample_meta["num_events"] += n_events
        sample_meta["sum_of_weights"] += sum_w
        sample_meta["sum_of_weights_squared"] += sum_w2
        all_files.append(nf)

    # Reports the files that are actually used.
    print("Added %i files for %s"%(len(all_files),cat))

    return all_files, sample_meta
    

def mergeSamples(RDF_spec,all_categories,outdir,skim="",doCSV=False):
    """This is run on data and merges all the files into one file per run period.

    Parameters:
        RDF_spec:       specification handed to ROOT.RDF.Experimental.FromSpec (the
                        ``__main__`` section passes the path of the JSON file it wrote).
        all_categories: category names; only names containing "data" are merged.
        outdir:         output directory; files are written to ``{outdir}/Data/``.
        skim:           skim name handed to applySkim().
        doCSV:          write ``{cat}.csv`` through makeCSV() instead of ``{cat}.root``.

    The per-sample metadata of the specification (num_events, sum_of_weights,
    sum_of_weights_squared, corrected_xsec, dsid, category) is exposed as columns,
    and one output file is produced per category by filtering on ``category``.
    """
    df = ROOT.RDF.Experimental.FromSpec(RDF_spec)
    # (column name, typed getter of the per-sample metadata)
    per_sample_columns = (
        ("num_events", "GetD"),
        ("sum_of_weights", "GetD"),
        ("sum_of_weights_squared", "GetD"),
        ("corrected_xsec", "GetD"),
        ("dsid", "GetI"),
        ("category", "GetS"),
    )
    for column, getter in per_sample_columns:
        df = df.DefinePerSample(column,'rdfsampleinfo_.%s("%s")'%(getter,column))

    df = applySkim(skim,df)

    ftype = "csv" if doCSV else "root"
    for cat in all_categories:
        if "data" not in cat:
            continue
        print("INFO \t Merging %s"%cat)
        start_time = time.time()
        selected = df.Filter(f'category == "{cat}"')
        if doCSV:
            makeCSV(selected,f'{outdir}/Data/{cat}.csv')
        else:
            selected.Snapshot("analysis",f'{outdir}/Data/{cat}.root')
        end_time = time.time()
        td = timedelta(seconds=end_time-start_time)
        print(f'INFO \t Created merged file {outdir}/Data/{cat}.{ftype} in hh:mm:ss:', td)

if __name__=="__main__":
    # Script entry point:
    #   1. read the luminosity tables and the MC metadata table,
    #   2. apply the branching-ratio corrections to the MC metadata,
    #   3. parse the command line,
    #   4. locate the downloaded ntuple directories, write one output per MC sample
    #      and collect the data runs into a specification for the merge step,
    #   5. write the specification to JSON and merge the data per run period.

    required = ["directory"]

    RDF_spec = {"samples":{}}

    # Luminosity tables: the first field of a row is the run number, the remaining
    # fields are keyed by the header row of the first table. The header row of the
    # second table ends up as an ordinary entry, which is harmless because entries
    # are only looked up by run number.
    grl = []
    for lumi_table in ('lumitable_2015.csv','lumitable_2016.csv'):
        with open(lumi_table) as fh:
            grl += [line.rstrip() for line in fh]
    header = grl[0].split(",")[1:]
    grl_dic = {}
    for row in grl[1:]:
        sp = row.split(",")
        did = sp[0]
        if did not in grl_dic:
            grl_dic[did] = dict(zip(header, sp[1:]))

    # MC metadata: the first field of a row is the dataset ID (kept as a string key).
    with open('metadata_new.csv') as fh:
        metadata = [line.rstrip() for line in fh]
    header = metadata[0].split(",")[1:]
    metadata_dic = {}
    for row in metadata[1:]:
        sp = row.split(",")
        did = sp[0]
        if did not in metadata_dic:
            metadata_dic[did] = dict(zip(header, sp[1:]))

    # Convert the numeric fields and fold the branching ratio of the simulated decay
    # into genFiltEff (and a k-factor correction for one sample).
    for did in metadata_dic:
        metadata_dic[did]['crossSection_pb'] = float(metadata_dic[did]['crossSection_pb'])
        metadata_dic[did]['genFiltEff'] = float(metadata_dic[did]['genFiltEff'])
        metadata_dic[did]['kFactor'] = float(metadata_dic[did]['kFactor'])
        if did in ['346645','346646','346647','345066','346414','346511']: # H -> ZZ -> 4l (incl. tau)
            metadata_dic[did]['genFiltEff'] *= 2.745E-04
        elif did in ['345060','346228','346340','346341','346342']: # H -> ZZ -> 4l (no tau)
            metadata_dic[did]['genFiltEff'] *= 1.240E-04
            if did in ['345060']:
                metadata_dic[did]['kFactor'] *= 1.45
        elif did in ['345322','346198','345320','345321','345316','345833']: #H -> Zgamma
            metadata_dic[did]['genFiltEff'] *= 1.533E-03
            if did in ['345833','345321']:
                metadata_dic[did]['genFiltEff'] *= 0.01022121 # Need to take into account the Z->l+l-
        elif did in ['345218','345124']: # H -> etau
            metadata_dic[did]['genFiltEff'] *= 1.0e-5
        elif did in ['345219','345125']: # H -> mutau
            metadata_dic[did]['genFiltEff'] *= 1.0e-5
        elif did in ['345876','345877']: # H -> ee
            metadata_dic[did]['genFiltEff'] *= 2.176E-04/40000.
        elif did in ['346524']: # H -> WW -> lvlv (incl. tau)
            metadata_dic[did]['genFiltEff'] *= 2.338E-02
        elif did in ['345965','345961','345834']: # H -> gamgam
            metadata_dic[did]['genFiltEff'] *= 2.270E-03
        elif did in ['346927','345217']: # H -> tautau
            metadata_dic[did]['genFiltEff'] *= 6.272E-02
        elif did in ['450576']: # H -> ZZ -> llbb (calculated by taking: 2.619E-02*(((15.12/100.)*(2*(3.37)/100.))*2) : BR(H->ZZ)*2*[Z->bb*Z->ee/mm])
            metadata_dic[did]['genFiltEff'] *= 0.00053

    # Provenance of the 1.0e-5 used above for H -> etau and H -> mutau (answer from gpt.uio.no):
    #   >> What is the theoretical branching ratio for Higgs decaying to a muon and a tau lepton?
    #   >> The theoretical branching ratio for the Higgs boson decaying to a muon and a tau lepton
    #      is extremely low, on the order of 10^-5 or even lower. This is because the muon and tau
    #      lepton are both heavy particles, and the Higgs boson prefers to decay to lighter
    #      particles such as bottom quarks, W and Z bosons, and photons. As a result, the decay to
    #      a muon and a tau lepton is highly suppressed, making it a very rare event in Higgs
    #      boson decays.
    # NOTE(review): H -> e tau and H -> mu tau violate lepton flavour, so the value above is an
    # arbitrary normalisation rather than a Standard Model prediction, and the explanation
    # quoted above is not a physics argument - confirm the intended normalisation.

    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--directory", action="store", dest="directory", help="Directory where files are downloaded")
    parser.add_option("-t", "--tag", action="store", dest="tag", default = "", help="Tag used in grid processing")
    parser.add_option("-u", "--uname", action="store", dest="uname", default = "", help="Username of grid jobs (i.e. your CERN username)")
    parser.add_option("-o", "--outdir", action="store", dest="outdir", default = "", help="Directory to store output ntuples")
    parser.add_option("-c", "--ncpu", action="store", dest="ncpu", default = 0, help="Number of cpus to use in merging")
    parser.add_option("-g", "--getsamples", action="store", dest="getsamples", default = 0, help="If just want to dump list of samples")
    parser.add_option("-j", "--specfile", action="store", dest="specfile", default = "", help="If specification file already exists")
    # The help text lists exactly the skim names that applySkim() handles.
    parser.add_option("-s", "--skim", action="store", dest="skim", default ="all", help="If want to make skimmed samples (currently exactly4lep, exactly2bjets, 2muons, GamGam, 3J1LMET30 and 2J2LMET30 are supported)")
    parser.add_option("-p", "--proc", action="store", dest="proc", default="all", help="Comma separated list of processes to include (e.g. Higgs,ttbar,Diboson etc)")
    parser.add_option("-k", "--csv", action="store", dest="csv", default=0, help="Creates CSV files as output")

    (opts, args) = parser.parse_args()
    getsamples = int(opts.getsamples)
    if not getsamples:
        for r in required:
            if getattr(opts, r) is None:
                parser.error("parameter %s required"%r)
    else:
        # Only dump the list of samples and stop.
        samples = getListOfSamples()
        with open("all_samples.txt","w") as f:
            f.write("\n".join(samples))
        sys.exit(0)

    directory = opts.directory
    tag = opts.tag
    uname = opts.uname
    # An empty output directory cannot be created; fall back to the current directory.
    outdir = opts.outdir or "."
    specfile = opts.specfile
    skim = opts.skim
    ncpu = int(opts.ncpu)
    procs = opts.proc.split(",")
    csv = int(opts.csv)

    doData = False

    if ncpu:
        print("Using %i CPUs"%ncpu)
        ROOT.EnableImplicitMT(ncpu)

    for needed_dir in (outdir, outdir+"/MC/", outdir+"/Data/"):
        if not os.path.isdir(needed_dir):
            print("INFO \t Creating directory %s" %needed_dir)
            os.makedirs(needed_dir)

    ntup_files = []
    if not specfile:
        specfile = f'specification_{tag}.json'
        ntup_files = list(locate(f'user.{uname}*{tag}.*_ntup', directory))
    else:
        # NOTE(review): with an existing specification no directories are scanned, so
        # doData stays False and all_categories stays empty: the specification is
        # written back unchanged and no merge is run. Confirm whether the merge
        # should be driven from the loaded specification instead.
        with open(specfile, "r") as f:
            RDF_spec = json.load(f)

    intlumi = 0.0
    added_runs = 0.0
    all_categories = []
    for nf in ntup_files:

        ntup_dir = nf.split("/")[-1]
        hf = os.path.join(directory,ntup_dir.replace("_ntup","_hist"))
        if not os.path.isdir(hf):
            # This message used to reference an undefined name and raised NameError.
            print("ERROR \t Could not find directory with histograms for %s.\n Would not be possible to get event information"%hf)
            continue

        # Data directories carry an 8-digit run number, MC directories a 6-digit dataset ID.
        isData = False
        result = re.search(r'\.\d{8}\.', ntup_dir)
        if result:
            did = int(result.group()[1:-1])
            dickey = "%i"%(did)
            isData = True
            if dickey in grl_dic:
                intlumi += float(grl_dic[dickey][' LAr Corrected'])
                added_runs += 1
            else:
                print("ERROR \t Could not find luminosity for run %s"%dickey)
        else:
            result = re.search(r'\.\d{6}\.', ntup_dir)
            if not result:
                # This message used to reference an undefined name and raised NameError.
                print("ERROR \t Could not find did in directory name %s"%ntup_dir)
                continue
            did = int(result.group()[1:-1])

        if did in [346343,346344,346345]:
            print("INFO \t Skipping did %i"%did)
            continue
        dickey = "%i"%(did)

        cat = getCategory(did,isData)

        # Process selection: MC categories must match exactly, data categories match
        # when one of the requested names is contained in the category name.
        if "all" not in procs:
            if not isData:
                if cat not in procs:
                    continue
            elif not any(p in cat for p in procs):
                print("Skipping %s"%cat)
                continue

        if cat not in all_categories:
            all_categories.append(cat)

        if not isData and dickey not in metadata_dic:
            print("WARNING \t Missing metadata information for %s in category %s"%(dickey,cat))
            continue

        if not isData:
            print("Cat is %s"%cat)
            all_files, metadata = createNtup(cat,nf,hf,uname,did,skim,isData,metadata_dic[dickey],csv)
        else:
            all_files, metadata = getMetadata(cat,nf,hf,uname,did,isData)

            if dickey not in RDF_spec["samples"]:
                RDF_spec["samples"][dickey] = {"trees":["analysis"],
                                               "files":all_files,
                                               "metadata":metadata}
            else:
                print("ERROR \t DSID %s already in dictionary"%dickey)

            doData = True

    with open(specfile, 'w') as f:
        json.dump(RDF_spec, f)
    if doData:
        mergeSamples(specfile,all_categories,outdir,skim,csv)

    
    

    
