Source code for fermi_stacking.preprocessing.Preprocess

############################################################
# 
# Written by Chris Karwin; April 2022; Clemson University.
# 
# Significant restructuring by Chris Karwin Feb 2024; 
# NASA Goddard Space Flight Center. 
#
# Based on original code from Marco Ajello, Vaidehi Paliya, and Abhishek Desai.
#
# Purpose: Main script for Fermi-LAT stacking analysis.
#
###########################################################

########################################
# Imports
from time import sleep
from random import randint
import resource
import random,shutil,yaml
import os,sys
from math import *
import numpy as np
import matplotlib 
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from fermipy.gtanalysis import GTAnalysis
import gc
import pyLikelihood 
from BinnedAnalysis import *
from astropy.io import fits
import pandas as pd
import math
from scipy.stats import norm
import matplotlib.mlab as mlab
from scipy.ndimage.filters import gaussian_filter
from astropy.convolution import convolve, Gaussian2DKernel
from scipy import stats, interpolate, optimize
from matplotlib import ticker, cm
from scipy.stats.contingency import margins
import fermi_stacking.pop_studies.PopStudies as PS
from IntegralUpperLimit import calc_int
from UpperLimits import UpperLimits
from SummedLikelihood import *
from astropy.io import fits
#######################################

# Superclass:

[docs]
class StackingAnalysis:
   
    """Superclass for conducting Fermi-LAT stacking.
    
    Parameters
    ----------
    sample_file : str
        Full path to sample file. 
    file_type : str
        Type for the sample file. Must be either 'csv', 'fits', or 'tab'.
    column_name : str
        Header of name column in sample file. 
    column_ra : str
        Header of ra column in sample file.
    column_dec : str
        Header of dec column in sample file. 
    run_list : list
        List of names to run (subsample of full sample). For full 
        sample use "default".
    psf_low : int
        Lower PSF iterator for JLA stacking (0-3). Default is 0 for 
        standard analysis and preprocessing.
    psf_high : int
        Upper PSF iterator for JLA stacking (1-4). Default is 1 for 
        standard analysis and preprocessing.
    run_name : str
        Main name of run.
    job_type : str
        "p" for preprocessing or "s" for stacking.
    ft1 : str
        Full path to ft1 photon data file. 
    ft2 : str
        Full path to spacecraft file. 
    galdiff : str
        Full path to Galactic diffuse model.
    isodiff : str
        Full path to isotropic model.
    ltcube : str, optional
        Full path to precomputed ltcube (the default is 'None', in
        which case the ltcube is calculated on the fly).
    use_scratch : bool
        If True, will perform analysis in scratch directory.
    scratch : str
        Full path to sratch directory.
    JLA : bool
        If True will perform joint likelihood analysis, using 4 
        different PSF classes. If False, a standard analysis will
        be performed. 
    irfs : str
        Name of LAT instrument response functions.
    evclass : int
        Event class, i.e. source, clean, etc.
    evtype : int
        Event type, i.e. back/front conversion, etc.
    emin : float or int
        Miminum energy of analysis in MeV. 
    emax : float or int
        Maximum energy of analysis in MeV. 
    tmin : float or int
        Minimum time of analysis in mission elapsed time (MET).
    tmax : float or int
        Maximum time of analysis in mission elapsed time (MET).
    zmax : float or int
        Maximun zenith angle to use in the analysis in degrees. 
    index_min : float
         Minimum index for stacking scan (absolute value).
    index_max : float
        Maximum index for stacking scan (absolute value).
    flux_min : float
        Power of min flux for stacking scan.
    flux_max : float
        Power of max flux for stacking scan.
    num_flux_bins : int
        Number of flux bins to use. 
    alpha_low : float
        Lower bound of alpha range. 
    alpha_high : float
        Upper bound of alpha range.
    alpha_step : float
        Step size of alpha list
    beta_low : float
        Log of Lower bound of beta range,
    beta_high : float
        Log of Upper bound of beta range (non-inclusive)
    beta_step : float
        step size of beta list
    calc_sed : bool
        If True will calculate SED. 
    sed_logEbins : list
        Log of energy bin edges for SED calculation. 
    delete_4fgl : bool
        Option to run 4FGL source. If True, must provide 
        "remove_list.csv" file in run directory, with col1=sample_name, 
        col2=4fgl_name, and no header.
    show_plots : bool
        If True will show plots. Set to False for submitted batch jobs. 

    Note
    ----
    All inputs are passed with inputs.yaml file.
    """


    def __init__(self,input_yaml):

        # Get home directory:
        self.home = os.getcwd()
	
        # Load main inputs from yaml file:
        with open(input_yaml,"r") as file:
            inputs = yaml.load(file,Loader=yaml.FullLoader)

        # Main default inputs:
        self.ft1 = inputs["ft1"] # data file
        self.ft2 = inputs["ft2"] # spacecraft file
        self.galdiff = inputs["galdiff"] # Galactic diffuse model	
        self.isodiff = inputs["isodiff"] # isotropic model

        # ltcube:
        self.ltcube = inputs["ltcube"]
   
        # Scratch directory:
        self.use_scratch = inputs["use_scratch"]
        if self.use_scratch == True:
            self.scratch = inputs["scratch"]
            if(os.path.isdir(self.scratch)==False):
                os.system('mkdir %s' %self.scratch)
      
        # Perform joint likelihood analysis (True) or standard analysis (False):
        self.JLA = inputs["JLA"]
        
        # Main analysis parameters:
        self.run_name = inputs["run_name"]
        self.irfs = inputs["irfs"]
        self.evclass = inputs["evclass"]
        self.evtype = inputs["evtype"]
        self.emin = inputs["emin"]
        self.emax = inputs["emax"]
        self.tmin = inputs["tmin"]
        self.tmax = inputs["tmax"]
        self.zmax = inputs["zmax"]
        self.index_min = inputs["index_min"]
        self.index_max = inputs["index_max"]
        self.flux_min = inputs["flux_min"]
        self.flux_max = inputs["flux_max"]
        self.num_flux_bins = inputs["num_flux_bins"]

        # Parameters for alpha-beta stack
        self.alpha_low = inputs["alpha_low"]
        self.alpha_high = inputs["alpha_high"]
        self.alpha_step = inputs["alpha_step"]
        self.beta_low = inputs["beta_low"]
        self.beta_high = inputs["beta_high"]
        self.beta_step = inputs["beta_step"]
        self.alpha_range = np.arange(self.alpha_low,self.alpha_high,self.alpha_step)
        self.beta_range = np.arange(self.beta_low,self.beta_high,self.beta_step)

        # Option to run 4FGL sources:
        # Note: If True, must provide "remove_list.csv" file in run directory, with col1=sample_name, col2=4fgl_name, and no header.
        self.delete_4fgl = inputs["delete_4fgl"]
        if self.delete_4fgl == True:
            df = pd.read_csv("remove_list.csv",names=["col0","col1"])
            self.delete_sample_name = np.array(df["col0"].tolist())
            self.delete_4fgl_name = np.array(df["col1"].tolist())
        
        # Additional options:
        self.show_plots = inputs["show_plots"]
        self.calc_sed = inputs["calc_sed"]
        self.logEbins = inputs["sed_logEbins"]

        # Sample data:
        self.sample_file = inputs["sample_file"]
        self.file_type = inputs["file_type"]
        self.column_name = inputs["column_name"]
        
        if self.file_type == "fits":
            hdu = fits.open(self.sample_file)
            data = hdu[1].data
            self.sample_name_list = data[self.column_name].strip().tolist()

        if self.file_type == "csv":
            df = pd.read_csv(self.sample_file)
            self.sample_name_list = df[self.column_name].tolist()

        if self.file_type == "tab":
            df = pd.read_csv(self.sample_file, delim_whitespace=True)
            self.sample_name_list = df[self.column_name].tolist()

        # Science Tools:
        self.nxpix = inputs["nxpix"]
        self.nypix = inputs["nypix"]
        self.xref = inputs["xref"]
        self.yref = inputs["yref"]
        self.binsz = inputs["binsz"]
        self.coordsys = inputs["coordsys"]
        self.enumbins = inputs["enumbins"]
        self.proj = inputs["proj"]
        self.reduced_x = inputs["reduced_x"]
        self.reduced_y = inputs["reduced_y"]
        self.path = inputs["path"]
        self.ROI_RA = inputs["ROI_RA"]
        self.ROI_DEC = inputs["ROI_DEC"]
        self.ROI_radius = inputs["ROI_radius"]


[docs]
    def ang_sep(self,ra0, dec0, ra1, dec1):
       
        """Calculate angular distance between two points on the sky.

        Parameters
        ----------
        ra0 : float
            Right ascension of first source in degrees. 
        dec0 : float
            Declination of first source in degrees.
        ra1 : float
            Right ascension of second source in degrees.
        dec1 : float
            Declination of first source in degrees.
        
        Returns
        -------
        float
            Angular distance between two sources in degrees. 
        """
        
        C = np.pi/180.
        d0 = C * dec0
        d1 = C * dec1
        r12 = C * (ra0 - ra1)
        cd0 = np.cos(d0)
        sd0 = np.sin(d0)
        cd1 = np.cos(d1)
        sd1 = np.sin(d1)
        cr12 = np.cos(r12)
        sr12 = np.sin(r12)
        num = np.sqrt((cd0 * sr12) ** 2 + (cd1 * sd0 - sd1 * cd0 * cr12) ** 2)
        den = sd0 * sd1 + cd0 * cd1 * cr12
        
        return np.arctan2(num, den) / C



[docs]
    def run_preprocessing(self,srcname,ra,dec,components=None):
       
        """Perform preprocessing of source.

        Parameters
        ---------
        srcname : str 
            Name of source.
        ra : float
            Right ascension of source. 
        dec : float
            Declination of source. 
        components : list of strings, optional
            Specify components of analysis via a list, where each 
            entry in the list is a line that is to be written in the 
            yaml file. Must include new line command at the end of 
            each line. 
        """

	# Make print statement:
        print()
        print("Running preprocessing...")
        print()
 
        # Need to specify coordinates as type float
        ra = float(ra)
        dec = float(dec)

        # Define main output directory for source:
        preprocess_output = os.path.join(self.home,"Preprocessed_Sources")
        if(os.path.isdir(preprocess_output)==False):
            os.system('mkdir %s' %preprocess_output)

        # Remove src output directory if it already exists:
        src_output_main = os.path.join(preprocess_output,srcname)
        if(os.path.isdir(src_output_main)==True):
            shutil.rmtree(src_output_main)
        # Make src output directory:
        os.system('mkdir %s' %src_output_main)

        # Define scratch directory for source:
        if self.use_scratch == True:
            preprocess_scratch = os.path.join(self.scratch,"Preprocessing")
            if(os.path.isdir(preprocess_scratch)==False):
                os.system('mkdir %s' %preprocess_scratch)
            src_scratch = os.path.join(preprocess_scratch,srcname)
            if(os.path.isdir(src_scratch)==True):
                shutil.rmtree(src_scratch)
            os.system('mkdir %s' %src_scratch)
            os.chdir(src_scratch)

        if self.use_scratch == False:
             os.chdir(src_output_main)
        	
        with open('%s.yaml' % srcname, 'w') as yml:
            yml.write("logging:\n")
            yml.write("  verbosity : 3\n")
            yml.write("  chatter : 3\n")
            yml.write("#--------#\n")
            yml.write("fileio:\n")
            yml.write("  outdir : output\n")
            yml.write("  logfile : %s\n" %srcname)
            yml.write("  usescratch : False\n")
            yml.write("  scratchdir : scratch\n")
            yml.write("#--------#\n")
            yml.write("data:\n")
            yml.write("  evfile : '%s'\n" %self.ft1)
            yml.write("  scfile : '%s'\n" %self.ft2)
            if self.ltcube != "None":
                yml.write("  ltcube : '%s'\n" %self.ltcube)
            yml.write("#--------#\n")
            yml.write("binning:\n")
            yml.write("  roiwidth : 10\n")
            yml.write("  binsz : 0.08\n")
            yml.write("  binsperdec : 8\n")
            yml.write("#--------#\n")
            yml.write("selection:\n")
            yml.write("  emin : %s\n" %self.emin)
            yml.write("  emax : %s\n" %self.emax)
            yml.write("  zmax : %s\n" %self.zmax)
            yml.write("  target : '%s'\n" %srcname)
            yml.write("  radius : 15\n")
            yml.write("  tmin : %s\n" %self.tmin)
            yml.write("  tmax : %s\n" %self.tmax)
            yml.write("  evclass : %s\n" %self.evclass)
            yml.write("  evtype : %s\n" %self.evtype)
            yml.write("  filter : 'DATA_QUAL>0 && LAT_CONFIG==1'\n")
            yml.write("#--------#\n")
            yml.write("gtlike:\n")  
            yml.write("  edisp : True\n")
            yml.write("  edisp_disable : ['isodiff']\n")
            yml.write("  irfs : '%s'\n" %self.irfs)
            yml.write("#--------#\n")
            yml.write("model:\n")
            yml.write("  src_radius : 15\n")
            yml.write("  src_roiwidth : 15\n")
            yml.write("  galdiff : '%s'\n" %self.galdiff)
            yml.write("  isodiff : '%s'\n" %self.isodiff)
            yml.write("  catalogs :\n")
            yml.write("    - '4FGL-DR3'\n")
            yml.write("  extdir : '/zfs/astrohe/Software/fermipy_source/lib/python3.9/site-packages/fermipy-1.1.3+2.g21485-py3.9.egg/fermipy/data/catalogs/Extended_12years/'\n")
            yml.write("  sources :\n")
            yml.write("    - { 'name' : '%s', 'ra' : %s, 'dec' : %s, 'SpectrumType' : PowerLaw }\n" %(srcname,ra,dec))
            yml.write("#--------#\n")
            yml.write("plotting:\n")
            yml.write("  format : png\n")
            yml.write("#--------#\n")
            yml.write("sed:\n")
            yml.write("  use_local_index : True\n")
	        
            # Include components for joint likelihood analysis (JLA):
            if self.JLA == True and components == None:
                yml.write("components:\n")
                yml.write("  - { model: {isodiff: iso_P8R3_SOURCE_V3_PSF0_v1.txt},\n")
                yml.write("      selection : { evtype : 4 } }\n")
                yml.write("  - { model: {isodiff: iso_P8R3_SOURCE_V3_PSF1_v1.txt},\n")
                yml.write("      selection : { evtype : 8 } }\n")
                yml.write("  - { model: {isodiff: iso_P8R3_SOURCE_V3_PSF2_v1.txt},\n")
                yml.write("      selection : { evtype : 16 } }\n")
                yml.write("  - { model: {isodiff: iso_P8R3_SOURCE_V3_PSF3_v1.txt},\n")
                yml.write("      selection : { evtype : 32 } }\n")
        
            # Option to write components from list:
            
            # Require user to specify JLA = True when passing components:
            if self.JLA == False and components != None:
                print("ERROR: JLA must be True to pass component list.")
                sys.exit()

            # Read components from list:  
            if self.JLA == True and components != None:
                yml.write("components:\n")
                for each in components:
                    yml.write(each)

        yml.close()
	
        gta = GTAnalysis('%s.yaml' % srcname,logging={'verbosity' : 3})
        gta.setup()
    
        # if rerunning 4fgl source, first delete source from model:
        if self.delete_4fgl == True:
            delete_sample_name = self.delete_sample_name
            delete_4fgl_name = self.delete_4fgl_name
            if srcname in delete_sample_name:
                this_index = srcname == delete_sample_name
                this_name = delete_4fgl_name[this_index][0]
                gta.delete_source(this_name)

        gta.optimize()
        gta.print_roi()
        gta.free_source('galdiff')
        gta.free_source('isodiff')
        gta.free_sources(minmax_ts=[25,None],pars='norm', distance=5.0)
        gta.free_sources(minmax_ts=[500,None],distance=7.0)
        gta.free_source(srcname)
        gta.fit()
        gta.write_roi('fit_model_1')
        gta.delete_source(srcname)
        model2 = {'Index' : 2.0, 'SpatialModel' : 'PointSource'}
        finder2 = gta.find_sources(prefix='find2',model=model2,sqrt_ts_threshold=4.0,
				min_separation=0.5,max_iter=10,sources_per_iter=20,
				tsmap_fitter='tsmap')
        gta.write_roi('fit_model_2')

        names2    = np.array([finder2['sources'][i]['name'] for i in range(len(finder2['sources']))])
        ra_ns2    = np.array([finder2['sources'][i]['ra'] for i in range(len(finder2['sources']))])
        dec_ns2   = np.array([finder2['sources'][i]['dec'] for i in range(len(finder2['sources']))])
        r95_ns2   = np.array([finder2['sources'][i]['pos_r95'] for i in range(len(finder2['sources']))])
        dist_sep = '%.2f' %0
        namee=''
	
	# Check ang separation if more than one source is found close to ra and dec
        if len(names2)>0:
            for i in range(len(names2)):
                sepp2=self.ang_sep(ra_ns2[i],dec_ns2[i],ra,dec)
                if sepp2 < r95_ns2[i] and sepp2 < 0.2:
                    print(names2[i],sepp2,r95_ns2[i])
                    dist_sep = '%.2f' %sepp2
                    namee = names2[i]
        if namee:                   
            # Write name of replaced source:
            f = open("output/replaced_source_name.txt","w")
            f.write(str([srcname,namee]))
            f.close()
            srcname=namee
        else:
            gta.add_source(srcname,{ 'ra' : ra, 'dec' : dec,
        		'SpectrumType' : 'PowerLaw', 'Index' : 2.0,
        		'Scale' : 1000, 'Prefactor' : 1e-11,
        		'SpatialModel' : 'PointSource' })
        gta.optimize()
        gta.print_roi()
        gta.free_source('galdiff')
        gta.free_sources(minmax_ts=[500,None],distance=7.0)
        gta.free_source(srcname)
        gta.fit()
        gta.write_roi('fit_model_3')
        
        # Calculate SED:
        if self.calc_sed == True:
            gta.sed(srcname,loge_bins=self.logEbins)

        p = np.load('output/fit_model_3.npy',allow_pickle=True).flat[0]
        src = p['sources'][srcname]
        if str(src['ts'])=='nan':	# To check non-convergence
            print('****************************')
            print('Fit has not converged')
            print('****************************')
            gta.free_sources(minmax_ts=[None,100],free=False)
            gta.free_source(srcname)
            gta.fit(tol=1e-8)
            gta.write_roi('fit_model_3')
            p = np.load('output/fit_model_3.npy',allow_pickle=True).flat[0]
            src = p['sources'][srcname]
        else:
            print('****************************')
            print('Fit has converged')
            print('****************************')

        if src['ts'] > 15 and np.fabs(src['param_values'][1]) > 3.0:
            model4 = {'Index' : 2.8, 'SpatialModel' : 'PointSource'}
            print('****************************')
            print('Source still bad TS=%s Index=%s?' %(src['ts'],src['param_values'][1]))
            print('****************************')
            gta.delete_source(srcname)
            mapp=gta.tsmap('fit_no_source_final',model=model4)
            finder4 = gta.find_sources(prefix='find4',model=model4,sqrt_ts_threshold=4.0,
                      min_separation=1.0,max_iter=10,sources_per_iter=20,
                      tsmap_fitter='tsmap')
            gta.add_source(srcname,{ 'ra' : ra, 'dec' : dec,
                	'SpectrumType' : 'PowerLaw', 'Index' : 2.0,
                	'Scale' : 1000, 'Prefactor' : 1e-11,
                	'SpatialModel' : 'PointSource' })
            gta.free_sources(minmax_ts=[100,None],pars='norm',distance=5.0)
            gta.free_sources(minmax_ts=[200,None],distance=7.0)
            gta.free_source(srcname)
            gta.fit()
            gta.write_roi('fit_model_3')
            p = np.load('output/fit_model_3.npy',allow_pickle=True).flat[0]
            src = p['sources'][srcname]

        TS = '%.2f' %src['ts']
        Flux = '%.2e' %src['eflux']
        Flux_err = '%.2e' %src['eflux_err']
        Flux_UL = '%.2e' %src['eflux_ul95']
        Index = '%.2f' %(np.fabs(src['param_values'][1]))
        Index_err = '%.2f' %src['param_errors'][1]
        f = open('%s_Param.txt' % srcname, 'w')
        f.write(str(srcname) + "\t" + str(dist_sep) + "\t" + str(Flux) + "\t" + str(Flux_err) + "\t" + str(Index) + "\t" + str(Index_err) + "\t" + str(Flux_UL) + "\t" + str(TS) + "\n")
        f.close()
	 
        # Calculate likelihood for null hypothesis:

        if self.JLA == False:
            iteration_list = [0]
        if self.JLA == True:
            iteration_list = [0,1,2,3]
        for j in iteration_list:

            srcmap = 'output/srcmap_0%s.fits' %j
            bexpmap = 'output/bexpmap_0%s.fits' %j
            xmlfile = 'output/fit_model_3_0%s.xml' %j
            savexml = 'output/null_likelihood_%s.xml'%j
            savetxt = 'output/null_likelihood_%s.txt' %j
            
            # Check for ltcube:
            if os.path.isfile(self.ltcube) == False:
                self.ltcube = 'output/ltcube_0%s.fits' %str(j)
                if os.path.isfile(self.ltcube) == False:
                    print("ERROR: no ltcube found.")
                    sys.exit()

            obs = BinnedObs(srcMaps=srcmap,expCube=self.ltcube,binnedExpMap=bexpmap,irfs='%s' %self.irfs)
            like = BinnedAnalysis(obs,xmlfile,optimizer='Minuit') 
            like.deleteSource(srcname)
            freeze=like.freeze
            for k in range(len(like.model.params)):
                freeze(k)
            like.model['galdiff'].funcs['Spectrum'].getParam('Prefactor').setFree(True)
            like.model['galdiff'].funcs['Spectrum'].getParam('Index').setFree(True)
            like.model['isodiff'].funcs['Spectrum'].getParam('Normalization').setFree(True)
            likeobj = pyLike.Minuit(like.logLike)
            thisL = like.fit(verbosity=1,covar=True,optObject=likeobj) #this returns -logL
            value = like.logLike.value() #this returns logL
            like.logLike.writeXml(savexml)

            print("*********")
            print("-logL: " + str(thisL))
            print() 
	
            f = open(savetxt,'w')
            f.write(str(value))
            f.close()

        if self.use_scratch == True:
            
            shutil.copytree(src_scratch,src_output_main,dirs_exist_ok=True)
    
        os.chdir(self.home)

        return



[docs]
    def make_preprocessing_summary(self):

        """Makes sumarray of preprocessing."""

        # Construct empty dataframe:
        df_full = pd.DataFrame(data=None, columns=["name","dist_sep","flux","flux_err","index","index_err","flux_ul","TS"])

        # Fill dataframe:
        missing_list = []
        for each in self.sample_name_list:
	
            srcname = each
            
            # Check for updated name:
            indir = os.path.join(self.home,"Preprocessed_Sources",srcname,"output")
            new_name_file = os.path.join(indir,"replaced_source_name.txt")
            replace_name = False
            if os.path.exists(new_name_file) == True:
                f = open(new_name_file,"r")
                this_list = eval(f.read())
                newsrcname = this_list[1]
                replace_name = True

            src_file = "%s/%s_Param.txt" % (srcname,srcname)
            wdir = os.path.join(self.home,"Preprocessed_Sources",src_file) 
	    
            if os.path.exists(wdir) == False and replace_name == True:
                src_file = "%s/%s_Param.txt" % (srcname,newsrcname)
                wdir = os.path.join(self.home,"Preprocessed_Sources",src_file)

            if os.path.exists(wdir) == False:
                missing_list.append(srcname)
                print() 
                print("Does not exists: " + srcname)
                print() 
		
            if os.path.exists(wdir) == True:
                this_file = wdir
                df = pd.read_csv(this_file,sep='\s+',names=["name","dist_sep","flux","flux_err","index","index_err","flux_ul","TS"])
                df_full = pd.concat([df,df_full]).reset_index(drop=True)

        df_full = df_full.sort_values(by=["TS"],ascending=False).reset_index(drop=True)

        print()
        print(df_full)
        print()

        # Write dataframe to text file:
        f = open("Preprocessed_Sources/preprocessing_summary_LLAGN.txt","w")
        f.write("\n")
        f.write("*****************\n")
        f.write("preprocessing summary:\n\n")
        f.write(df_full.to_string(index=True))
        f.write("\n\n")
        f.write("****************\n")
        f.write("Missing sources:")
        f.write(str(missing_list))
        f.close()

        df_full.to_csv("Preprocessed_Sources/preprocessing_summary_LLAGN.csv",sep ="\t",index=False)
    
        return