Source code for ILAMB.ModelResult

from .Variable import Variable
from netCDF4 import Dataset
from . import ilamblib as il
from cf_units import Unit
import numpy as np
import glob,os,re
from mpi4py import MPI
import logging

logger = logging.getLogger("%i" % MPI.COMM_WORLD.rank)

def _skipFile(pathName,altvars,lats,lons,same_site_epsilon):
    """Some simple logic intended to help speed up models which consist of
       many single site runs.
    """
    if lats is None: return False
    if lats.size > 1: return False
    with Dataset(pathName) as dset:
        for v in altvars:
            if v in dset.variables:
                V = dset.variables[v]
                D = V.dimensions[-1]
                if D not in ['data','lndgrid']: continue
                if dset.dimensions[D].size > 1: continue
                X = dset.variables['lat'][...]
                Y = dset.variables['lon'][...]
                Y = (Y<=180)*Y + (Y>180)*(Y-360) + (Y<-180)*360
                if (np.sqrt((X-lats[0])**2+(Y-lons[0])**2) > same_site_epsilon): return True

    return False

[docs]class ModelResult():
    """A class for exploring model results.

    This class provides a simplified way of accessing model
    results. It is essentially a pointer to a top level directory and
    defines the model as all netCDF4 files found in its
    subdirectories. If this directory contains model output from
    several runs or experiments, you may wish to specify a string (the
    *filter* argument) which we will require to be in the filename for
    it to be considered part of the model.

    Parameters
    ----------
    path : str
        the full path to the directory which contains the model result
        files
    modelname : str, optional
        a string representing the name of the model, will be used as a
        label in plot legends
    color : 3-tuple, optional
        a normalized tuple representing a color in RGB color space,
        will be used to color line plots
    filter : str, optional
        this string must be in file's name for it to be considered as
        part of the model results
    model_year : 2-tuple of int, optional
        used to shift model times, all model years at model_year[0]
        are shifted to model_year[1]
    """
    def __init__(self,path,modelname="unamed",color=(0,0,0),filter="",regex="",model_year=None,paths=None,description="",group=""):
        self.path           = path
        self.color          = color
        self.filter         = filter
        self.regex          = regex
        self.shift          = 0.
        if model_year is not None: self.shift = (model_year[1]-model_year[0])*365.
        self.name           = modelname
        self.confrontations = {}
        self.cell_areas     = None
        self.land_fraction  = None
        self.land_areas     = None
        self.land_area      = None
        self.variables      = None
        self.names          = None
        self.extents        = np.asarray([[-90.,+90.],[-180.,+180.]])
        self.paths          = paths
        self.description    = description
        self.group          = group
        self._findVariables()
        self._getGridInformation()

    def __str__(self):
        s  = ""
        s += "ModelResult: %s\n" % self.name
        s += "-"*(len(self.name)+13) + "\n"
        for key in self.names:
            s += "{0:>20}: {1:<50}".format(key,self.names[key]) + "\n"
        return s

    def _findVariables(self):
        """Loops through the netCDF4 files in a model's path and builds a dictionary of which variables are in which files.
        """
        def _get(key,dset):
            dim_name = key
            try:
                v = dset.variables[key]
                dim_bnd_name = v.getncattr("bounds")
            except:
                dim_bnd_name = None
            return dim_name,dim_bnd_name

        variables = {}
        names     = {}
        paths     = self.paths if self.paths is not None else [self.path]
        experiment_id = None
        source_id = None
        for path in paths:
            for subdir, dirs, files in os.walk(path,followlinks=True):
                for fileName in files:
                    if not fileName.endswith(".nc"): continue
                    if self.filter not in fileName: continue
                    if self.regex is not "":
                        m = re.search(self.regex,fileName)
                        if not m: continue
                    pathName  = os.path.join(subdir,fileName)

                    try:
                        dataset = Dataset(pathName)
                    except:
                        logger.debug("[%s] Error opening file %s" % (self.name,pathName))
                        continue

                    # harvest some model information
                    if experiment_id is None and "experiment_id" in dataset.ncattrs():
                        experiment_id = dataset.experiment_id
                    if source_id is None and "source_id" in dataset.ncattrs():
                        source_id = dataset.source_id
                        
                    # populate dictionary for which variables are in which files
                    for key in dataset.variables.keys():
                        if key not in variables:
                            variables[key] = []
                        variables[key].append(pathName)

                        v = dataset.variables[key]
                        if key not in names:
                            if "long_name" in v.ncattrs():
                                names[key] = v.long_name
                                continue
                            if "standard_name" in v.ncattrs():
                                names[key] = v.standard_name
                                continue

        # modify the description if none exists
        if self.description == "":
            d = []
            if source_id is not None: d.append(source_id)
            if experiment_id is not None: d.append(experiment_id)
            self.description = " ".join(d)
            
        # determine spatial extents
        lats = [key for key in variables.keys() if (key.lower().startswith("lat" ) or
                                                    key.lower().  endswith("lat" ))]
        lons = [key for key in variables.keys() if (key.lower().startswith("lon" ) or
                                                    key.lower().  endswith("lon" ) or
                                                    key.lower().startswith("long") or
                                                    key.lower().  endswith("long"))]
        for key in lats:
            for pathName in variables[key]:
                with Dataset(pathName) as dset:
                    lat = dset.variables[key][...]
                    if lat.size == 1: continue
                    self.extents[0,0] = max(self.extents[0,0],lat.min())
                    self.extents[0,1] = min(self.extents[0,1],lat.max())
        for key in lons:
            for pathName in variables[key]:
                with Dataset(pathName) as dset:
                    lon = dset.variables[key][...]
                    if lon.size == 1: continue
                    if lon.ndim < 1 or lon.ndim > 2: continue
                    lon = (lon<=180)*lon + (lon>180)*(lon-360) + (lon<-180)*360
                    self.extents[1,0] = max(self.extents[1,0],lon.min())
                    self.extents[1,1] = min(self.extents[1,1],lon.max())

        # fix extents
        eps = 5.
        if self.extents[0,0] < (- 90.+eps): self.extents[0,0] = - 90.
        if self.extents[0,1] > (+ 90.-eps): self.extents[0,1] = + 90.
        if self.extents[1,0] < (-180.+eps): self.extents[1,0] = -180.
        if self.extents[1,1] > (+180.-eps): self.extents[1,1] = +180.
        self.variables = variables
        self.names = names

    def _getGridInformation(self):
        """Looks in the model output for cell areas as well as land fractions.
        """
        def _shiftLon(lon):
            return (lon<=180)*lon + (lon>180)*(lon-360) + (lon<-180)*360
        
        # Are there cell areas associated with this model?
        area_name = None
        area_name = "area"      if "area"      in self.variables.keys() else area_name
        area_name = "areacella" if "areacella" in self.variables.keys() else area_name
        if area_name is not None:
            with Dataset(self.variables[area_name][0]) as f:
                A = f.variables[area_name]
                unit = Unit(A.units) if "units" in A.ncattrs() else Unit("m2")
                self.cell_areas = unit.convert(A[...],"m2",inplace=True)
        else:
            if not ("lat_bnds" in self.variables.keys() and
                    "lon_bnds" in self.variables.keys()): return
            with Dataset(self.variables["lat_bnds"][0]) as f:
                x = f.variables["lat_bnds"][...]
            with Dataset(self.variables["lon_bnds"][0]) as f:
                y = f.variables["lon_bnds"][...]
                s = y.mean(axis=1).argmin()
                y = np.roll(_shiftLon(y),-s,axis=0)
                if y[ 0,0] > y[ 0,1]: y[ 0,0] = -180.
                if y[-1,0] > y[-1,1]: y[-1,1] = +180.
            self.cell_areas = il.CellAreas(None,None,lat_bnds=x,lon_bnds=y)
            
        # Now we do the same for land fractions
        frac_name = None
        frac_name = "landfrac" if "landfrac" in self.variables.keys() else frac_name
        frac_name = "sftlf"    if "sftlf"    in self.variables.keys() else frac_name
        if frac_name is None:
            self.land_areas = self.cell_areas
        else:
            with Dataset(self.variables[frac_name][0]) as f:
                self.land_fraction = f.variables[frac_name][...]
            # some models represent the fraction as a percent
            if np.ma.max(self.land_fraction) > 10: self.land_fraction *= 0.01
            with np.errstate(over='ignore',under='ignore'):
                if not np.allclose(self.cell_areas.shape,self.land_fraction.shape):
                    msg = "The model %s has areacella %s which is a different shape than sftlf %s" % (self.name,
                                                                                                      str(self.cell_areas.shape),
                                                                                                      str(self.land_fraction.shape))
                    raise ValueError(msg)
                self.land_areas = self.cell_areas*self.land_fraction
        self.land_area = np.ma.sum(self.land_areas)
        return

[docs]    def extractTimeSeries(self,variable,lats=None,lons=None,alt_vars=[],initial_time=-1e20,final_time=1e20,output_unit="",expression=None,convert_calendar=True):
        """Extracts a time series of the given variable from the model.

        Parameters
        ----------
        variable : str
            name of the variable to extract
        alt_vars : list of str, optional
            alternate variables to search for if *variable* is not found
        initial_time : float, optional
            include model results occurring after this time
        final_time : float, optional
            include model results occurring before this time
        output_unit : str, optional
            if specified, will try to convert the units of the variable
            extract to these units given.
        lats : numpy.ndarray, optional
            a 1D array of latitude locations at which to extract information
        lons : numpy.ndarray, optional
            a 1D array of longitude locations at which to extract information
        expression : str, optional
            an algebraic expression describing how to combine model outputs

        Returns
        -------
        var : ILAMB.Variable.Variable
            the extracted variable

        """
        # prepend the target variable to the list of possible variables
        altvars = list(alt_vars)
        altvars.insert(0,variable)

        # checks on input consistency
        if lats is not None: assert lons is not None
        if lons is not None: assert lats is not None
        if lats is not None: assert lats.shape == lons.shape

        # create a list of datafiles which have a non-null intersection
        # over the desired time range
        V = []
        tmin =  1e20
        tmax = -1e20
        same_site_epsilon = 0.5
        for v in altvars:
            if v not in self.variables: continue
            for ifile,pathName in enumerate(self.variables[v]):
                if _skipFile(pathName,altvars,lats,lons,same_site_epsilon): continue
                var = Variable(filename       = pathName,
                               variable_name  = variable,
                               alternate_vars = altvars[1:],
                               area           = self.land_areas,
                               convert_calendar = convert_calendar,
                               t0             = initial_time - self.shift,
                               tf             = final_time   - self.shift)
                if var.time is None: continue
                tmin = min(tmin,var.time_bnds.min())
                tmax = max(tmax,var.time_bnds.max())
                if ((var.time_bnds.max() < initial_time - self.shift) or
                    (var.time_bnds.min() >   final_time - self.shift)): continue
                if lats is not None and var.ndata:
                    r = np.sqrt((lats[:,np.newaxis]-var.lat)**2 +
                                (lons[:,np.newaxis]-var.lon)**2)
                    imin = r.argmin(axis=1)
                    rmin = r.   min(axis=1)
                    imin = imin[np.where(rmin<same_site_epsilon)]
                    if imin.size == 0:
                        continue
                    var.lat   = var.lat [  imin]
                    var.lon   = var.lon [  imin]
                    var.data  = var.data[:,imin]
                    var.ndata = var.data.shape[1]
                if lats is not None and var.spatial: var = var.extractDatasites(lats,lons)
                var.time      += self.shift
                var.time_bnds += self.shift
                V.append(var)
            if len(V) > 0: break

        # If we didn't find any files, try to put together the
        # variable from a given expression
        if len(V) == 0:
            if expression is not None:
                v = self.derivedVariable(variable,
                                         expression,
                                         lats         = lats,
                                         lons         = lons,
                                         initial_time = initial_time,
                                         final_time   = final_time)
            else:
                tstr = ""
                if tmin < tmax: tstr = " in the given time frame, tinput = [%.1f,%.1f], tmodel = [%.1f,%.1f]" % (initial_time,final_time,tmin+self.shift,tmax+self.shift)
                logger.debug("[%s] Could not find [%s] in the model results%s" % (self.name,",".join(altvars),tstr))
                raise il.VarNotInModel()
        else:
            v = il.CombineVariables(V)


        return v

[docs]    def derivedVariable(self,variable_name,expression,lats=None,lons=None,initial_time=-1e20,final_time=1e20,convert_calendar=True):
        """Creates a variable from an algebraic expression of variables in the model results.

        Parameters
        ----------
        variable_name : str
            name of the variable to create
        expression : str
            an algebraic expression describing how to combine model outputs
        initial_time : float, optional
            include model results occurring after this time
        final_time : float, optional
            include model results occurring before this time
        lats : numpy.ndarray, optional
            a 1D array of latitude locations at which to extract information
        lons : numpy.ndarray, optional
            a 1D array of longitude locations at which to extract information

        Returns
        -------
        var : ILAMB.Variable.Variable
            the new variable

        """
        from sympy import sympify
        if expression is None: raise il.VarNotInModel()
        args  = {}
        units = {}
        unit  = expression
        mask  = None
        time  = None
        tbnd  = None
        lat   = None
        lon   = None
        ndata = None
        area  = None
        depth = None
        dbnds = None

        # first pass to make sure all variables are defined in the interval
        free_symbols = sympify(expression).free_symbols
        Vs = {}; t0 = initial_time; tf = final_time
        for arg in free_symbols:
            Vs[arg] = self.extractTimeSeries(arg.name,
                                             lats = lats,
                                             lons = lons,
                                             convert_calendar = convert_calendar,
                                             initial_time = initial_time,
                                             final_time   = final_time)
            t0 = max(Vs[arg].time_bnds[ 0,0],t0)
            tf = min(Vs[arg].time_bnds[-1,1],tf)

        for arg in free_symbols:
            var = Vs[arg].trim(t=[t0,tf])
            units[arg.name] = var.unit
            args [arg.name] = var.data.data

            if mask is None:
                mask  = var.data.mask
            else:
                mask += var.data.mask
            if time is None:
                time  = var.time
            else:
                assert(np.allclose(time,var.time))
            if tbnd is None:
                tbnd  = var.time_bnds
            else:
                assert(np.allclose(tbnd,var.time_bnds))
            if lat is None:
                lat  = var.lat
            else:
                assert(np.allclose(lat,var.lat))
            if lon is None:
                lon  = var.lon
            else:
                assert(np.allclose(lon,var.lon))
            if area is None:
                area  = var.area
            else:
                assert(np.allclose(area,var.area))
            if ndata is None:
                ndata  = var.ndata
            else:
                assert(np.allclose(ndata,var.ndata))
            if depth is None:
                depth  = var.depth
            else:
                assert(np.allclose(depth,var.depth))
            if dbnds is None:
                dbnds  = var.depth_bnds
            else:
                assert(np.allclose(dbnds,var.depth_bnds))

        np.seterr(divide='ignore',invalid='ignore')
        result,unit = il.SympifyWithArgsUnits(expression,args,units)
        np.seterr(divide='raise',invalid='raise')
        mask  += np.isnan(result)
        result = np.ma.masked_array(np.nan_to_num(result),mask=mask)

        return Variable(data       = np.ma.masked_array(result,mask=mask),
                        unit       = unit,
                        name       = variable_name,
                        time       = time,
                        time_bnds  = tbnd,
                        lat        = lat,
                        lon        = lon,
                        area       = area,
                        ndata      = ndata,
                        depth      = depth,
                        depth_bnds = dbnds)