import os
import sys
from optparse import OptionParser
import glob
import fnmatch
import re
import ROOT
import json
import time
from datetime import timedelta
import pandas as pd
import numpy as np

def locate(pattern, root_path):
    """
    Recursively search a directory tree for sub-directories whose name
    matches a shell-style pattern.
    This is a generator: it yields the absolute path of every matching
    directory (plain files are never matched).
    Arguments:
    pattern - fnmatch-style pattern the directory name has to match
    root_path - top directory where the recursive search starts
    """
    top = os.path.abspath(root_path)
    for current, subdirs, _files in os.walk(top):
        matching = fnmatch.filter(subdirs, pattern)
        yield from (os.path.join(current, name) for name in matching)



def getPeriodsDef():
    """
    Builds the lookup table of data taking periods.
    Returns a dictionary keyed by data taking year ("data16", "data15");
    each value maps a period name to the list of all run numbers
    belonging to that period.
    """
    # (period name, first run, last run) - both ends are inclusive
    boundaries = {
        "data16": (
            ("periodA", 297730, 300279),
            ("periodB", 300345, 300908),
            ("periodC", 301912, 302393),
            ("periodD", 302737, 303560),
            ("periodE", 303638, 303892),
            ("periodF", 303943, 304494),
            ("periodG", 305380, 306451),
            ("periodI", 307126, 308084),
            ("periodK", 309375, 309759),
            ("periodL", 310015, 311481),
        ),
        "data15": (
            ("periodD", 276262, 276954),
            ("periodE", 278880, 279928),
            ("periodF", 279932, 280368),
            ("periodG", 280423, 281075),
            ("periodH", 281317, 281411),
            ("periodJ", 282625, 284484),
        ),
    }

    dataperiods = {}
    for year, periods in boundaries.items():
        dataperiods[year] = {
            name: list(range(first, last + 1)) for name, first, last in periods
        }
    return dataperiods

def getDataPeriods(datayr,rnum):
    """
    Looks up the data taking period a run belongs to.
    Returns the name of the period, or -1 (after printing a message)
    if either the year is unknown or the run is not part of any period.
    Arguments:
    datayr - data taking year (data15, data16, etc.)
    rnum   - run number
    """
    period_table = getPeriodsDef()

    if datayr not in period_table:
        print("Data year %s not in dictionary"%datayr)
        return -1

    # First period whose run list contains the run number (None if there is none)
    found = next(
        (name for name, runs in period_table[datayr].items() if rnum in runs),
        None,
    )
    if found is None:
        print("Did not find period for run %i"%rnum)
        return -1
    return found

def getCategory(did,isData=False):
    """
    Retrieves the category (i.e. data taking period for data
    and production process for simulations).
    Returns "MC" for simulation. For data it returns either the identifier
    itself (when it is not a run number), "<year>_<period>" for a run number
    that belongs to a known period, or the string "None" (after printing an
    error) when no period could be assigned.
    Arguments:
    did - identifier (either a run number/dataset number or data15/data16)
    isData  - if this is real data or simulations
    """
    if not isData:
        return "MC"

    # In the case we run over full data containers (not run-by-run containers)
    # the did is just data15 or data16
    if not isinstance(did, int):
        return did

    if 276262 <= did <= 284484:
        year = "data15"
    elif 297730 <= did <= 311481:
        year = "data16"
    else:
        year = None

    if year is not None:
        period = getDataPeriods(year,did)
        # getDataPeriods returns -1 for runs that fall in the gaps between
        # periods; concatenating that with a string would raise a TypeError.
        if period != -1:
            return year+"_"+period

    print("ERROR \t Could not find category for %i. Returning None" %did)
    return "None"
 
def _defineCount(df, flag, selection, counter):
    """
    Defines a per-object selection column (flag) and a second column (counter)
    holding the sum of that selection over all objects in the event.
    Returns the data frame with both columns defined.
    """
    df = df.Define(flag, selection)
    return df.Define(counter, "ROOT::VecOps::Sum(%s,0.)" % flag)


def applySkim(skim,df):
    """
    Defines all the skims and applies them using the RDataFrame::Filter
    function (see RDF documentation). Returns a filtered data frame
    according to the chosen skim. An unknown skim name prints a warning
    and applies a filter that is always true.
    Note that several skims Define helper columns (e.g. n_muon, n_sig_lep),
    so the returned data frame carries those extra columns.
    Arguments:
    skim - name of skim (feel free to add additional skims)
    df  - the RDF to apply the skim on
    """
    if skim == "exactly4lep":
        return df.Filter("lep_n == 4","Exactly 4 leptons")

    if skim == "2to4lep":
        return df.Filter("lep_n >= 2 && lep_n <= 4","2, 3 or 4 leptons")

    if skim == "exactly2bjets":
        df_flat = _defineCount(df,"is_bjet","jet_btag_quantile == 2","n_bjet")
        # The cut is an equality, so the cut-flow label must say "exactly"
        return df_flat.Filter("n_bjet == 2","Exactly two b-jets")

    if skim == "2muons":
        df_flat = _defineCount(df,"is_muon","abs(lep_type) == 13 && lep_pt > 10","n_muon")
        return df_flat.Filter("n_muon >= 2","At least two muons")

    if skim == "GamGam":
        df_flat = _defineCount(df,"sig_ph","photon_pt > 20","n_sig_ph")
        return df_flat.Filter("n_sig_ph >= 2","At least two photons")

    if skim == "3J1LMET30":
        df_flat = _defineCount(df,"sig_lep","lep_isTightID","n_sig_lep")
        df_flat = df_flat.Filter("n_sig_lep >= 1","At least one lepton")
        # NOTE(review): the skim name suggests three jets but the cut requires
        # jet_n > 3 (i.e. at least four) - confirm which one is intended.
        df_flat = df_flat.Filter("jet_n > 3","More than three jets")
        return df_flat.Filter("met > 30","MET > 30 GeV")

    if skim == "2J2LMET30":
        df_flat = _defineCount(df,"sig_lep","lep_isTightID","n_sig_lep")
        df_flat = df_flat.Filter("n_sig_lep >= 2","At least two leptons")
        df_flat = df_flat.Filter("jet_n >= 2","At least two jets")
        return df_flat.Filter("met > 30","MET > 30 GeV")

    if skim == "1LMET30":
        df_flat = _defineCount(df,"sig_lep","lep_isLooseID","n_sig_lep")
        df_flat = df_flat.Filter("n_sig_lep >= 1","At least 1 loose leptons")
        return df_flat.Filter("met > 30","MET > 30 GeV")

    print("WARNING \t No filters applied!")
    return df.Filter("1.0","nofilter")
    
def getHistograms(tfil,did):
    """
    Retrieves the so-called CutBookkeeper
    histogram which stores the sum of weights.
    This value represents the cumulative total of
    weights across all events in the dataset.
    This is useful for normalizing the dataset to represent
    physical quantities like cross-sections more accurately.
    Returns the first object in the file whose key name starts with
    "CutBookkeeper_<did>_<six digits>_NOSYS" and whose class inherits
    from TH1, or None if the file contains no such object.
    Arguments:
    tfil - ROOT TFile object of file containing the histogram (from grid jobs)
    did  - unique data set identifier (integer) of the simulated sample
    """
    wanted = re.compile(r"CutBookkeeper_%i_\d{6}_NOSYS" % did)
    # Iterating the key list directly guarantees that every key is visited
    # exactly once (the hand-written iterator loop never advanced past a
    # histogram whose name did not match, which made it spin forever).
    for key in tfil.GetListOfKeys():
        # Cheap name test first; only matching keys need the class lookup
        if not wanted.match(key.GetName()):
            continue
        cl = ROOT.gROOT.GetClass(key.GetClassName())
        # cl is a null pointer when ROOT does not know the class
        if cl and cl.InheritsFrom("TH1"):
            return key.ReadObj()
    return None

def makeCSV(df,outname):
    """
    Writes a (skimmed) data frame as comma separated values (CSV)
    by converting the RDF to a pandas data frame. Only a selection of
    columns is written:
    - every vector column with "lep_" in its name is flattened into four
      scalar columns (lep_pt -> lep1_pt, lep2_pt, lep3_pt, lep4_pt)
    - every column whose name contains one of the keywords listed in
      `keep` below
    All other columns are skipped.
    The data frame must already be skimmed by the caller; no skim is applied
    here (applying the skim a second time would try to re-Define the helper
    columns created by applySkim).
    OBS: the flattening reads the first four entries of each lepton vector,
    so it is only meaningful for skims that guarantee four leptons per event.
    OBS: be aware that the process of converting RDF -> CSV consumes
    a lot of memory and might be problematic for large files with many events
    (aggressive skimming is recommended when using this).
    Returns nothing.
    Arguments:
    df - (already skimmed) data frame to convert to csv
    outname  - name of output csv file
    """
    df_flat = df
    # Scalar variables that are written to the CSV file as they are
    keep = ("trigE","trigM","ScaleFactor_","mcWeight","corrected_xsec",
            "sum_of_weights","lep_n","channelNumber","runNumber","eventNumber")
    # List to store columns/variables we want to write to CSV
    numpy_col = []
    ## Loop through all the columns
    for coln in df_flat.GetColumnNames():
        col = str(coln)
        if "lep_" in col and "ROOT::VecOps::RVec" in df_flat.GetColumnType(coln):
            # Vector variable: flatten it (converting lep_pt -> lep1_pt .. lep4_pt)
            # This is for a 4-lepton skim so we always know we will have 4 leptons
            for i in range(4):
                newcol = col.replace("lep_","lep%i_"%(i+1))
                # Defines a new column from the vector element
                df_flat = df_flat.Define(newcol,f'{col}[{i}]')
                if newcol not in numpy_col:
                    numpy_col.append(newcol)
        # Scalar lepton columns (e.g. lep_n) must reach this test as well,
        # otherwise the "lep_n" keyword could never match.
        if any(word in col for word in keep):
            numpy_col.append(col)
    # Converts to CSV file (OBS: problematic for files containing many events!)
    pandas_df = pd.DataFrame(data=df_flat.AsNumpy(numpy_col))
    pandas_df.to_csv(outname)
    
def createNtup(cat,ntdir,hfdir,uname,did,skim="all",isData=False,corrected_metadata=None,doCSV=False):
    """
    Produces the skimmed ntuple with the additional variables such as the sum of weights
    and the corrected cross section.
    For data nothing is written: the list of input files and a metadata dictionary
    are returned so that the caller can merge all data at the end.
    For simulation one output file (ROOT or CSV) is written per call to
    <outdir>/MC/, where outdir is read from the module level variable of that
    name (it is set by the script section of this file).
    Returns a tuple (list of accepted ntuple files, metadata dictionary).
    Arguments:
    cat - category
    ntdir  - directory containing all the ntuple files for a given process from grid running
    hfdir  - directory containing all the histogram (i.e. CutBookKeeper) files for a given process from grid running
    uname  - username used when producing the ntuples on the grid
    did    - data set identifier (run number / dataset number, or data15/data16 for data containers)
    skim   - name of skim to apply (default: "all", i.e. no skim)
    isData - if it is data (default: false)
    corrected_metadata - dictionary with updated metadata information; required for
                         simulation (default: None, treated as an empty dictionary)
    doCSV - if one wants to produce CSV files as output (default: false)
    """
    if corrected_metadata is None:
        corrected_metadata = {}

    # Retrieves all the files from the two directories containing the ntuples and the histograms
    ntupfiles = sorted(glob.glob(f'{ntdir}/user.{uname}.*.root'))
    histfiles = set(glob.glob(f'{hfdir}/user.{uname}.*.root'))

    if isData:
        # If this is data we want to merge all of data together at the end,
        # so no ntuple is written here and there is no need to look at the
        # CutBookKeeper histograms. We only return the files from this input.
        if isinstance(did, int):
            dsid = did
        else:
            dsid = 2016 if did == "data16" else 2015
        metadata_dic = {"num_events":1.0,"sum_of_weights":1.0,"sum_of_weights_squared":1.0,
                        "category":cat,"corrected_xsec":1.0,"dsid":dsid}
        for inf, nf in enumerate(ntupfiles, start=1):
            print("Adding file %i/%i for %s"%(inf,len(ntupfiles),cat))
        all_files = list(ntupfiles)
        print("INFO \t Added %i files for %s"%(len(all_files),cat))
        return all_files, metadata_dic

    # Simulation: cross section either given directly or as xsec * filter efficiency * k-factor
    if 'my_total' in corrected_metadata:
        xsec = float(corrected_metadata['my_total'])
    else:
        xsec = (float(corrected_metadata['crossSection_pb'])
                * float(corrected_metadata['genFiltEff'])
                * float(corrected_metadata['kFactor']))
    physdescr = corrected_metadata['physics_short'].strip()

    all_files = []
    nev = 0.0
    sow = 0.0
    sow2 = 0.0

    # Loop over all files and retrieves the sum of weights (sow),
    # sum of weights squared (sow2) and the sum of unweighted events (nev)
    # from the CutBookKeeper (CBK) histograms
    for inf, nf in enumerate(ntupfiles, start=1):
        print("Adding file %i/%i for %s"%(inf,len(ntupfiles),physdescr))

        # It is very important that every ntuple file has a corresponding histogram file,
        # if not the scaling will be off
        ntup_file = os.path.basename(nf)
        hist_file = ntup_file.replace("_ntup.root","_hist.root")
        look_for_hist_file = os.path.join(hfdir,hist_file)
        if look_for_hist_file not in histfiles:
            # If could not find CBK skip the ntuples file
            print("ERROR \t Did not find a matching hist file for %s"%(ntup_file))
            continue

        tfile = ROOT.TFile(look_for_hist_file)
        try:
            hist = getHistograms(tfile,did)
            if not hist:
                # Without the CBK the normalisation is unknown, so the ntuple is skipped too
                print("ERROR \t Did not find a CutBookkeeper histogram for %i in %s"%(did,hist_file))
                continue
            nev  += hist.GetBinContent(1)
            sow  += hist.GetBinContent(2)
            sow2 += hist.GetBinContent(3)
        finally:
            # The bin contents are copied out above, so the file can be closed
            # right away instead of keeping one open file per input.
            tfile.Close()
        all_files.append(nf)

    print("INFO \t Added %i files for %s with %f events and sow %f"%(len(all_files),physdescr,nev,sow))

    metadata_dic = {"num_events":nev,"sum_of_weights":sow,"sum_of_weights_squared":sow2,
                    "category":cat,"corrected_xsec":xsec,"dsid":did}

    if not all_files:
        # A data frame cannot be built from an empty list of files
        print("ERROR \t No usable ntuple files for %s. No output produced"%physdescr)
        return all_files, metadata_dic

    # We create one ntuple file per MC process
    # and we define an RDF with all the files that passed above
    df = ROOT.RDataFrame("analysis",all_files)
    df = applySkim(skim,df)
    # Add some additional variables (based on information from CBK).
    # repr() of a float keeps full precision and is always a floating point
    # literal ("%f" would round values below 1e-6 to zero).
    df = df.Define("num_events",repr(float(nev)))
    df = df.Define("sum_of_weights",repr(float(sow)))
    df = df.Define("sum_of_weights_squared",repr(float(sow2)))
    df = df.Define("DatasetNumber","%i"%did)
    df = df.Define("corrected_xsec",repr(xsec))
    # Name of output file
    outname = f'{outdir}/MC/mc_{did}.{physdescr}.{skim}.root'
    if not doCSV:
        df.Snapshot("analysis",outname)
        print("INFO \t Created file %s"%outname)
    else:
        outname_csv = outname.replace(".root",".csv")
        # An existing CSV file is never overwritten
        if not os.path.isfile(outname_csv):
            makeCSV(df,outname_csv)
    return all_files, metadata_dic
    

def mergeSamples(RDF_spec,all_categories,outdir,skim="",doCSV=False):
    """
    This is run on data only and merges all the files into one file per run period.
    Output is written to <outdir>/Data/.
    Arguments:
    RDF_spec - json-file used as input to create a RDF for all files belonging to a data taking period
    all_categories  - list of category names to produce output for (entries without "data" in the name are skipped)
    outdir  - output directory to store the ntuples
    skim   - name of skim to apply (default: empty string, i.e. no skim)
    doCSV - if one wants to produce CSV files as output (default: false)
    """

    # Use the FromSpec to define data frame and add extra variables with information stored in the
    # json-file. Please see documentation in ROOT:
    # https://root.cern.ch/doc/master/namespaceROOT_1_1RDF_1_1Experimental.html#a7193987f3c1b65c649399656cc6acce8
    df = ROOT.RDF.Experimental.FromSpec(RDF_spec)
    for column in ("num_events","sum_of_weights","sum_of_weights_squared","corrected_xsec"):
        df = df.DefinePerSample(column,'rdfsampleinfo_.GetD("%s")'%column)
    df = df.DefinePerSample("dsid",'rdfsampleinfo_.GetI("dsid")')
    df = df.DefinePerSample("category",'rdfsampleinfo_.GetS("category")')
    # Applies the skim
    df = applySkim(skim,df)

    # We want to split the data into data taking periods
    periodDef = getPeriodsDef()
    for cat in all_categories:
        # Should not happen, but just in case
        if "data" not in cat:
            print("WARNING \t Only to be run on data and %s is not data afaics"%cat)
            continue
        start_time = time.time()

        # If produce csv-file
        if doCSV:
            outfile = f'{outdir}/Data/{cat}.csv'
            makeCSV(df.Filter(f'category == "{cat}"'),outfile)
        # If we need to define cat based on run numbers.
        # This may be the case if one produced ntuples from the
        # data containers
        elif cat in periodDef:
            for period, runs in periodDef[cat].items():
                start = runs[0]
                stop = runs[-1]
                print("INFO \t Creating %s for %s with %i <= RunNumber <= %i"%(period,cat,start,stop))
                period_start = time.time()
                outfile = f'{outdir}/Data/{cat}_{period}.root'
                df.Filter(f'runNumber >= {start} && runNumber <= {stop}').Snapshot("analysis",outfile)
                td = timedelta(seconds=time.time()-period_start)
                print(f'INFO \t Created merged file {outfile} in hh:mm:ss:', td)
            # One file per period was written (there is no {cat}.root), so only
            # report the total time spent on this category.
            td = timedelta(seconds=time.time()-start_time)
            print(f'INFO \t Created all period files for {cat} in hh:mm:ss:', td)
            continue
        # If input was produced on the run-by-run containers we have the period information in the category variable
        else:
            outfile = f'{outdir}/Data/{cat}.root'
            df.Filter(f'category == "{cat}"').Snapshot("analysis",outfile)

        td = timedelta(seconds=time.time()-start_time)
        print(f'INFO \t Created merged file {outfile} in hh:mm:ss:', td)

def getDSwithProperTags(origDS,priorityDS,mainTag, priorityTags):
    """
    Sometimes grid jobs need to be re-run with a different name (because the original task failed).
    Then one would like to combine files from several grid jobs with slightly different names when creating the ntuples.
    For every data set in origDS this function checks whether the same name with mainTag replaced by one of the
    priority tags exists in priorityDS. If so, that data set is used instead of the original one; the tags are tried
    in the order given and the first match wins.
    Returns a new list with one entry per entry of origDS (either the original or its replacement).
    Arguments:
    origDS - all data sets with the original tag
    priorityDS - all data sets with other allowed tags
    mainTag - main tag used for the majority of the data sets
    priorityTags - all other tags to consider, most important first
    """
    # Set for constant time membership tests
    available = set(priorityDS)
    ntup_files = []
    for ds in origDS:
        chosen = ds  # keep original unless a priority version exists
        for pritag in priorityTags:
            candidate = ds.replace(mainTag,pritag)
            # Found a match. Switch to priority tag sample
            if candidate in available:
                chosen = candidate
                break
        ntup_files.append(chosen)
    return ntup_files

if __name__=="__main__":

    # NOTE: this section must stay a flat script. createNtup() and makeCSV()
    # read the module level variables `outdir` and `skim` that are set below.

    ## Input arguments (use --help for instructions)
    # Parsed before any input file is read, so that --help always works
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--directory", action="store", dest="directory", help="Directory where files are downloaded")
    parser.add_option("-t", "--tag", action="store", dest="tag", default = "", help="Tag used in grid processing")
    parser.add_option("-r", "--prioritytag", action="store", dest="prioritytag", default = "", help="Comma seperated list of priority tags. If data set is found with any of these tags use those in stead of the ones with nominal tag specified through -t (--tag) option. Tags are prioritized in the order they are specified (first tag most important)")
    parser.add_option("-u", "--uname", action="store", dest="uname", default = "", help="Username of grid jobs (i.e. your CERN username)")
    parser.add_option("-o", "--outdir", action="store", dest="outdir", default = "", help="Directory to store output ntuples")
    parser.add_option("-c", "--ncpu", action="store", dest="ncpu", default = 0, help="Number of cpus to use in merging. Default: 0 (i.e. single threaded)")
    parser.add_option("-j", "--specfile", action="store", dest="specfile", default = "", help="If specification file already exists")
    parser.add_option("-s", "--skim", action="store", dest="skim", default ="all", help="If want to makes skimmed samples (look in the applySkim() function for all supported skims)")
    parser.add_option("-p", "--proc", action="store", dest="proc", default="all", help="Specify data or mc if only want to run one of them. Default is all (i.e. both)")
    parser.add_option("-k", "--csv", action="store", dest="csv", default=0, help="Creates CSV files as output. Default is not to create CSV. Be aware that this may cause memory issues if not a very aggressive skimmming is applied (e.g. 4 leptons).")

    (opts, args) = parser.parse_args()

    # getting the arguments
    directory = opts.directory
    tag = opts.tag
    prioritytag = opts.prioritytag
    uname = opts.uname
    # An empty output directory means the current directory; the empty string
    # itself cannot be created and would turn outdir+"/MC/" into "/MC/".
    outdir = opts.outdir or "."
    specfile = opts.specfile
    skim = opts.skim
    ncpu = int(opts.ncpu)
    procs = opts.proc.split(",")
    csv = int(opts.csv)

    # Options that must be given when the input directories have to be scanned
    # (not needed when an existing specification file is reused)
    required = ["directory"]
    missing = [name for name in required if not getattr(opts, name)]
    if missing and not specfile:
        parser.error("Missing required option(s): %s"%", ".join("--"+name for name in missing))

    RDF_spec = {"samples":{}}

    ## Gets the luminosity for every run
    ## Making a dictionary with luminosity for every run
    grl_dic = {}
    for lumifile in ('lumitable_2015.csv', 'lumitable_2016.csv'):
        with open(lumifile) as infile:
            grl = [line.rstrip() for line in infile]
        if not grl:
            continue
        # Every table has its own header line, which must not end up as a data row
        header = grl[0].split(",")[1:]
        for line in grl[1:]:
            if not line:
                continue
            sp = line.split(",")
            # First occurrence of a run wins
            grl_dic.setdefault(sp[0], dict(zip(header, sp[1:])))

    # Preliminary (since Higgs cross sections are all over the place!)
    if os.path.isfile('metadata_new_withcorrections.json'):
        with open('metadata_new_withcorrections.json', 'r') as file:
            metadata_dic = json.load(file)
    else:
        # Reads the metadata downloaded from the OpenData page (https://opendata.atlas.cern/files/metadata.csv)
        # NOTE(review): the lines are split on every comma, so quoted fields that
        # contain a comma would be split as well - confirm against the file format.
        with open('metadata_new.csv') as infile:
            metadata = [line.rstrip() for line in infile]
        header = metadata[0].split(",")[1:] if metadata else []
        metadata_dic = {}
        for line in metadata[1:]:
            if not line:
                continue
            sp = line.split(",")
            metadata_dic.setdefault(sp[0], dict(zip(header, sp[1:])))

    doData = False

    ## Collecting all the tags to consider for the ntuple production
    ## This is handy when productions consists of several grid campaigns
    ## (due to failed/broken jobs in the first iteration)
    alltags = prioritytag.split(",") if prioritytag else []

    ## If to be run in multi threaded mode
    if ncpu:
        print("Using %i CPUs"%ncpu)
        ROOT.EnableImplicitMT(ncpu)

    ## Create directories to store output if they do not exist
    ## MC simulations will be stored in outdir/MC/, data in outdir/Data/
    for subdir in (outdir, outdir+"/MC/", outdir+"/Data/"):
        if not os.path.isdir(subdir):
            print("INFO \t Creating directory %s" %subdir)
            os.makedirs(subdir)

    ## Collects all the files to be used as input to the ntuples
    ntup_files = []
    all_categories = []
    reuse_spec = bool(specfile)
    ## If not a json-file with all the information has been specified
    if not reuse_spec:
        specfile = f'specification_{tag}.json'
        # Directories with the nominal tag
        ntup_files = list(locate(f'user.{uname}*{tag}.*_ntup', directory))
        # Collects directories for all priority tags (if any)
        additional_ntup_files = []
        for at in alltags:
            additional_ntup_files.extend(locate(f'user.{uname}*{at}.*_ntup', directory))
        # Swap in the priority-tag version wherever one exists
        ntup_files = getDSwithProperTags(ntup_files,additional_ntup_files,tag,alltags)
    ## If a json file with all the information exists (and has been specified as input parameter) no need to do it again
    else:
        with open(specfile, "r") as f:
            RDF_spec = json.load(f)
        # The categories to merge are taken from the metadata stored in the file
        for sample in RDF_spec["samples"].values():
            cat = sample["metadata"]["category"]
            if "all" not in procs and not any(p in cat for p in procs):
                continue
            if cat not in all_categories:
                all_categories.append(cat)
        doData = bool(all_categories)

    intlumi = 0.0
    added_runs = 0
    ## Loop over the ntuple directories
    for nf in ntup_files:
        ntup_dir = os.path.basename(nf)
        ## Find the corresponding histogram files
        hf = os.path.join(directory,ntup_dir.replace("_ntup","_hist"))
        if not os.path.isdir(hf):
            print("ERROR \t Could not find directory with histograms for %s.\n Would not be possible to get event information"%hf)
            continue
        # Check first if data
        ## There are two possibilities of running on the OpenData for research samples:
        ## 1) run-by-run containers (8 digit run number in the name)
        ## 2) data taking periods (data15/data16) containers
        isData = False
        result = re.search(r'\.\d{8}\.', ntup_dir)
        if result:
            did = int(result.group()[1:-1])
            dickey = "%i"%(did)
            isData = True
        elif 'data15' in ntup_dir or 'data16' in ntup_dir:
            did = "data16" if 'data16' in ntup_dir else "data15"
            dickey = did
            isData = True

        if isData:
            if dickey in grl_dic:
                intlumi += float(grl_dic[dickey][' LAr Corrected'])
                added_runs += 1
            else:
                print("ERROR \t Could not find luminosity for run %s"%dickey)
        # If not data
        else:
            # Retrieve the unique dataset identifier for MC simulation
            result = re.search(r'\.\d{6}\.', ntup_dir)
            if not result:
                print("ERROR \t Could not find did in directory name %s"%ntup_dir)
                continue
            did = int(result.group()[1:-1])
            dickey = "%s"%(did)

        ## If data files are made run-by-run this finds the right period, if not it doesn't do much
        ## Currently, for MC this only returns "MC", but can implement categories for MC too.
        cat = getCategory(did,isData)

        ## If only want to run over either data or mc
        if not isData:
            if "all" not in procs and "mc" not in procs and cat not in procs:
                print("INFO \t Skipping category %s since not specified in arguments"%cat)
                continue
        ## If only want to run over data15 or data16
        elif "all" not in procs and not any(p in cat for p in procs):
            print("Skipping %s"%cat)
            continue

        ## If no metadata available skip sample. This can probably be removed when OpenData metadata has been corrected
        if not isData and dickey not in metadata_dic:
            print("WARNING \t Missing metadata information for %s in category %s"%(dickey,cat))
            continue

        ## If simulation we create one ntuple per simulation process (i.e. data set identifier)
        if not isData:
            all_files, metadata = createNtup(cat,nf,hf,uname,did,skim,isData,metadata_dic[dickey],csv)
            continue

        ## If data we only collect the files and append a dictionary which we will use later when we make the ntuples
        all_files, metadata = createNtup(cat,nf,hf,uname,did,skim,isData)

        # Only data categories are merged at the end
        if cat not in all_categories:
            all_categories.append(cat)

        if dickey not in RDF_spec["samples"]:
            RDF_spec["samples"][dickey] = {"trees":["analysis"],
                                           "files":all_files,
                                           "metadata":metadata}
        else:
            print("ERROR \t DSID %s already in dictionary"%dickey)

        doData = True

    if added_runs:
        print("INFO \t Summed luminosity of the %i data inputs found in the luminosity tables: %f"%(added_runs,intlumi))

    # Writes information in dictionary to json-file (not needed when an existing file was loaded)
    if not reuse_spec:
        with open(specfile, 'w') as f:
            json.dump(RDF_spec, f)
    # Creates the ntuples for data based on the dictionary
    if doData:
        mergeSamples(specfile,all_categories,outdir,skim,csv)

    
    

    
