Source code for snowex_db.string_management

"""
Module for functions that interpret various strings encountered in files.
These functions either prep, strip, or interpret strings for headers or
the actual data to be uploaded.
"""
import numpy as np



[docs]
def clean_str(messy):
    """
    Removes unwanted characters in a str that we encounter a lot.

    Leading/trailing spaces and newlines are stripped, colons are removed
    unless the word is numeric once its colons are taken out (e.g. a time
    such as 10:30), and single and double quotes are removed everywhere.

    Args:
        messy: string to be cleaned
    Returns:
        clean: string without the unwanted characters
    """
    # Strip both characters in a single pass so that interleaved runs such as
    # '\n \n' are removed completely instead of only the outermost character
    strip_chars = ' \n'
    clean = messy.strip(strip_chars)

    # Remove colons but not when they sit between numbers (e.g. time)
    if ':' in clean:
        words = []

        for word in clean.split(' '):
            bare = word.replace(':', '')
            # Keep the original word when only digits remain without colons
            words.append(word if bare.isnumeric() else bare)

        clean = ' '.join(words)

    # Remove quote characters anywhere in the string
    for ch in ('"', "'"):
        clean = clean.replace(ch, '')

    # Dropping quotes/colons may have exposed new leading/trailing whitespace
    return clean.strip(strip_chars)




[docs]
def standardize_key(messy):
    """
    Normalizes a key so it can be used as a dataframe column name or a
    dictionary key: units in parentheses or square brackets are dropped, the
    text is cleaned and lowercased, and spaces and dashes become underscores.

    Args:
        messy: string to be cleaned
    Returns:
        clean: String minus all characters and patterns of no interest
    """
    # Units are written as (unit) or [unit]; drop both flavors
    without_units = messy
    for brackets in ('()', '[]'):
        without_units = strip_encapsulated(without_units, brackets)

    normalized = clean_str(without_units).lower()
    normalized = normalized.replace(' ', '_').replace('-', '_')

    # A utf-8 byte order mark read with a latin encoding shows up as these
    # characters, so filter them out of the key
    bom_chars = 'ï»¿'
    return ''.join(ch for ch in normalized if ch not in bom_chars)




[docs]
def remap_data_names(original, rename_map):
    """
    Remaps keys in a dictionary according to the rename dictionary. Also can be
    used for lists where the entries in the list can be renamed, or for a
    single string (which is lowercased before being looked up).

    Multisample names, i.e. a known name followed by an underscore and one
    character (e.g. dielectric_constant_a), are renamed while keeping their
    suffix. Names that have no entry in the map are passed through unchanged.

    Args:
        original: list/dictionary of names and values that may need remapping
        rename_map: Dictionary mapping names (keys) {old: new}

    Returns:
        new: List/dictionary containing the names remapped

    """
    if isinstance(original, dict):
        return {_remap_name(k, rename_map): v for k, v in original.items()}

    if isinstance(original, list):
        return [_remap_name(v, rename_map) for v in original]

    # Anything else is treated as a single name; it is always lowercased,
    # whether or not it ends up being remapped
    new = original.lower()
    return rename_map[new] if new in rename_map else new


def _remap_name(name, rename_map):
    """
    Returns the remapped version of a single name, or the name itself when
    neither it nor its multisample base is found in rename_map.
    """
    if name in rename_map:
        return rename_map[name]

    # Handle multisample names that need changing (e.g. dielectric_constant_a).
    # The length check keeps short names from raising an IndexError.
    if isinstance(name, str) and len(name) > 2 and name[-2] == '_':
        base, suffix = name[:-2], name[-2:]
        if base in rename_map:
            # Re-attach the suffix rather than using str.replace so the
            # sample suffix itself can never be rewritten
            return rename_map[base] + suffix

    return name




[docs]
def get_encapsulated(str_line, encapsulator):
    """
    Returns items found in the encapsulator, useful for finding units

    When the opening and closing characters differ, an opening character
    without a closing character after it is skipped. When both characters are
    the same, every piece of text following an occurrence of that character is
    returned (this includes text sitting between a closing and the next
    opening occurrence, and a trailing empty string).

    Args:
        str_line: String that has encapsulated info we want returned
        encapsulator: string of 1 or 2 characters encapsulating the info
    Returns:
        result: list of strings found inside anything between encapsulators
    Raises:
        ValueError: if the encapsulator is not 1 or 2 characters long

    e.g.
        line = 'density (kg/m^3), temperature (C)'
        ['kg/m^3', 'C'] = get_encapsulated(line, '()')
    """
    if not 1 <= len(encapsulator) <= 2:
        raise ValueError('encapsulator can only be 1 or 2 chars long!')

    # A single character acts as both the opening and the closing cap
    lcap, rcap = encapsulator[0], encapsulator[-1]

    result = []

    # The first piece always sits before any opening cap, so skip it
    for segment in str_line.split(lcap)[1:]:
        if lcap == rcap:
            result.append(segment)
            continue

        inner, closer, _ = segment.partition(rcap)
        # Only report text that is actually closed off; an unclosed opening
        # cap is ignored rather than raising
        if closer:
            result.append(inner)

    return result




[docs]
def strip_encapsulated(str_line, encapsulator):
    """
    Deletes every encapsulated piece of text from a string, together with the
    characters that encapsulate it.

    Args:
        str_line: String that has encapsulated info we want removed
        encapsulator: string of characters encapsulating info to be removed
    Returns:
        final: String without anything between encapsulators
    """
    enclosed = get_encapsulated(str_line, encapsulator)

    # Two characters means distinct opening/closing caps, otherwise the same
    # string is used for both
    if len(encapsulator) == 2:
        opener, closer = encapsulator
    else:
        opener = closer = encapsulator

    stripped = str_line
    for piece in enclosed:
        # Remove the text along with its surrounding caps
        stripped = stripped.replace(''.join((opener, piece, closer)), '')

    return stripped




[docs]
def parse_none(value):
    """
    Converts the various spellings of "no data" into a real None.

    Args:
        value: Value potentially containing a none or nan

    Returns:
        result: None when value is an empty string, one of the strings
                nan/none/-9999/-9999.0 (case insensitive), a NaN number or
                the number -9999. Otherwise the original value.
    """
    null_strings = ('nan', 'none', '-9999', '-9999.0')

    if isinstance(value, str):
        is_null = (not value) or value.lower() in null_strings

    elif isinstance(value, (float, int)):
        is_null = bool(np.isnan(value) or value == -9999)

    else:
        # Any other type is passed through untouched
        is_null = False

    return None if is_null else value




[docs]
def kw_in_here(kw, d, case_sensitive=True):
    """
    Checks whether the keyword is contained in at least one of the entries of
    a list. When a dictionary is given its keys are searched instead.

    e.g.

    dielectric_constant is found in [temperature, dielectric_constant_a]

    Args:
        kw: Keyword we're searching for
        d: List or dictionary with keys of strings
        case_sensitive: Boolean indicating whether it should be case sensitive
                        or not

    Returns:
        Bool: Indicating the keyword was found

    """
    candidates = d.keys() if isinstance(d, dict) else d
    needle = kw

    # Compare everything in lowercase when case does not matter
    if not case_sensitive:
        needle = kw.lower()
        candidates = [name.lower() for name in candidates]

    # Substring match, so the keyword may be just a part of an entry
    hits = [name for name in candidates if needle in name]
    return bool(hits)




[docs]
def get_alpha_ratio(str_line, encapsulator='""'):
    """
    Calculates the ratio of letters to numeric characters and
    optionally ignores anything that is encapsulated

    Args:
        str_line: String to evaluate
        encapsulator: chars that encapsulate strings to be ignored. Pass an
                      empty string or None to evaluate the whole line

    Returns:
        ratio: number of letters divided by number of numeric characters, or
               1 when the line contains no numeric characters
    """
    line = str_line

    # Remove any encapsulated (by default quoted) text, honoring the
    # encapsulator the caller asked for
    if encapsulator:
        line = strip_encapsulated(str_line, encapsulator=encapsulator)

    n_alpha = sum(1 for c in line if c.isalpha())
    n_numeric = sum(1 for c in line if c.isnumeric())

    # Avoid dividing by zero when there are no numbers at all
    if n_numeric == 0:
        return 1

    return n_alpha / n_numeric




[docs]
def line_is_header(str_line, header_sep=',', header_indicator='#', previous_alpha_ratio=None, expected_columns=None):
    """
    Determines whether a line is a header line

    When a header indicator is given it is decisive: the line is a header
    only if it starts with the indicator. Without an indicator the remaining
    checks each cast a vote and the majority decides (a tie is not a header).

    Args:
        str_line: Line to evaluate
        header_sep: Separator used to count the columns in the line, a falsy
                    value skips the column count check
        header_indicator: String that header lines start with, a falsy value
                          skips this check
        previous_alpha_ratio: Alpha ratio (see get_alpha_ratio) to compare this
                              line against, a falsy value skips the check
        expected_columns: Number of columns a header line should have. Note
                          that if this is None while header_sep is set, the
                          column check always votes against a header

    Returns:
        Bool: Indicating whether the line is considered a header
    """
    # Definitive indication of a header line. startswith copes with empty
    # lines and with indicators longer than one character
    if header_indicator:
        return str_line.startswith(header_indicator)

    # No immediate answer so build confidence
    votes = []
    if previous_alpha_ratio:
        votes.append(get_alpha_ratio(str_line) >= previous_alpha_ratio)

    if header_sep:
        # Ignore separators that live inside parentheses (e.g. units)
        line = strip_encapsulated(str_line, encapsulator='()')
        votes.append(len(line.split(header_sep)) == expected_columns)

    return votes.count(True) > votes.count(False)