Source code for snowex_db.string_management

"""
Module for functions that interpret various strings encountered in files.
These functions either prep, strip, or interpret strings for headers or
the actual data to be uploaded.
"""
import numpy as np


def clean_str(messy):
    """
    Tidy up a string by removing the stray characters we commonly encounter.

    Spaces and then newlines are stripped from both ends, colons are dropped
    from every space separated word unless the word is purely numeric once
    its colons are removed (e.g. a time such as 10:30), and every single or
    double quote is deleted.

    Args:
        messy: string to be cleaned

    Returns:
        text: the cleaned string
    """
    # Trim the ends: spaces first, then newlines (order matters)
    text = messy.strip(' ').strip('\n')

    # Drop colons, except inside words that are numbers apart from the colons
    if ':' in text:
        words = []
        for word in text.split(' '):
            without_colons = word.replace(':', '')
            words.append(word if without_colons.isnumeric() else without_colons)
        text = ' '.join(words)

    # Quotes are never wanted, wherever they appear
    text = text.replace('"', '').replace("'", '')

    return text.strip(' ')
def standardize_key(messy):
    """
    Preps a key for use in dataframe columns or dictionary. Removes any
    byte order mark, removes units, makes everything lowercase and replaces
    spaces and hyphens with underscores.

    Args:
        messy: string to be cleaned

    Returns:
        key: String minus all characters and patterns of no interest
    """
    key = messy

    # Remove the csv byte order mark left behind by utf-8 files. It shows up
    # as U+FEFF when decoded as utf-8 and as three latin-1 characters when the
    # file was decoded as latin. Escapes are used so the literal cannot be
    # silently lost by an editor. Done first so that the whitespace trimming
    # below is not blocked by the mark.
    for bom in ('\ufeff', '\xef\xbb\xbf'):
        key = key.replace(bom, '')

    # Remove units
    for caps in ('()', '[]'):
        key = strip_encapsulated(key, caps)

    key = clean_str(key)
    key = key.lower().replace(' ', '_').replace('-', '_')

    return key
def _remap_name(name, rename_map):
    """
    Return the renamed version of a single name, or the name itself when the
    rename map has nothing to say about it.
    """
    if name in rename_map:
        return rename_map[name]

    # Handle multisample names that need changing (e.g.
    # dielectric_constant_a): rename the base and keep the '_x' suffix.
    # The length guard keeps one and two character names from being indexed
    # out of range or reduced to an empty base.
    if len(name) > 2 and name[-2] == '_':
        base = name[:-2]
        if base in rename_map:
            return rename_map[base] + name[-2:]

    return name


def remap_data_names(original, rename_map):
    """
    Remaps keys in a dictionary according to the rename dictionary. Also can
    be used for lists where the entries in the list can be renamed. Any other
    input is treated as a single string which is lowercased and then renamed
    if it matches exactly.

    Names ending in an underscore plus one character (multisample names such
    as dielectric_constant_a) are renamed when the part before the suffix is
    in the rename map. Names with no match are passed through unchanged.

    Args:
        original: list/dictionary of names and values that may need remapping
        rename_map: Dictionary mapping names (keys) {old: new}

    Returns:
        new: List/dictionary containing the names remapped
    """
    if isinstance(original, dict):
        new = {_remap_name(k, rename_map): v for k, v in original.items()}

    elif isinstance(original, list):
        new = [_remap_name(v, rename_map) for v in original]

    else:
        new = original.lower()
        if new in rename_map:
            new = rename_map[new]

    return new
def get_encapsulated(str_line, encapsulator):
    """
    Returns items found in the encapsulator, useful for finding units

    Only text that is actually enclosed is returned: an opening character
    with no closing character after it contributes nothing, and when the
    opening and closing characters are the same (e.g. quotes) the text
    between one pair and the next is not reported.

    Args:
        str_line: String that has encapsulated info we want to find
        encapsulator: string of 1 or 2 characters encapsulating the info,
                      e.g. '()' or '"'. An empty string finds nothing.

    Returns:
        result: list of strings found inside anything between encapsulators

    Raises:
        ValueError: If the encapsulator is more than 2 characters long

    e.g.
        line = 'density (kg/m^3), temperature (C)'
        ['kg/m^3', 'C'] = get_encapsulated(line, '()')
    """
    if len(encapsulator) > 2:
        raise ValueError('encapsulator can only be 1 or 2 chars long!')

    # Nothing can be enclosed by nothing, and str.split rejects an empty
    # separator
    if not encapsulator:
        return []

    # A single character serves as both the opening and closing cap
    lcap, rcap = encapsulator[0], encapsulator[-1]

    # The first piece will always be before anything encapsulated
    pieces = str_line.split(lcap)

    if lcap == rcap:
        # Pieces alternate outside/inside. Odd positions are inside, and the
        # final piece is never closed so it is excluded.
        return pieces[1:-1:2]

    result = []
    for piece in pieces[1:]:
        inner, closer, _ = piece.partition(rcap)
        # An empty closer means no closing cap followed this opening cap
        if closer:
            result.append(inner)

    return result
def strip_encapsulated(str_line, encapsulator):
    """
    Removes from a str anything that is encapsulated by characters, along
    with the encapsulating chars themselves

    Args:
        str_line: String that has encapsulated info we want removed
        encapsulator: string of characters encapsulating info to be removed

    Returns:
        stripped: String without anything between encapsulators
    """
    # Two characters are an opening/closing pair, otherwise the same string
    # is used on both sides
    if len(encapsulator) == 2:
        opener = encapsulator[0]
        closer = encapsulator[1]
    else:
        opener = closer = encapsulator

    stripped = str_line

    # Cut out each encapsulated entry together with its surrounding caps
    for inner in get_encapsulated(str_line, encapsulator):
        stripped = stripped.replace(opener + inner + closer, '')

    return stripped
def parse_none(value):
    """
    Parses values looking for NANs, Nones, etc...

    Strings count as missing when they are empty or, ignoring case, equal to
    nan, none, -9999 or -9999.0. Floats and ints count as missing when they
    are nan or equal to -9999. Anything else is handed back untouched.

    Args:
        value: Value potentially containing a none or nan

    Returns:
        result: None when the value is considered missing, otherwise the
                original value
    """
    if isinstance(value, str):
        is_missing = (not value
                      or value.lower() in ['nan', 'none', '-9999', '-9999.0'])

    elif isinstance(value, (float, int)):
        is_missing = np.isnan(value) or value == -9999

    else:
        is_missing = False

    return None if is_missing else value
def kw_in_here(kw, d, case_sensitive=True):
    """
    Determines if the keyword is found in any of the entries in the List
    If any match is found returns true

    Can use a list or dictionary. If a dictionary is supplied the keys will be
    used

    e.g. dielectric_constant is found in [temperature, dielectric_constant_a]

    Args:
        kw: Keyword we're searching for
        d: List or dictionary with keys of strings
        case_sensitive: Boolean indicating whether it should be case sensitive
                        or not

    Returns:
        Bool: Indicating the keyword was found
    """
    # Iterating a dictionary yields its keys, so lists and dictionaries can
    # be searched the same way. any() stops at the first match.
    if case_sensitive:
        return any(kw in name for name in d)

    needle = kw.lower()
    return any(needle in name.lower() for name in d)
def get_alpha_ratio(str_line, encapsulator='""'):
    """
    Calculates the ratio of letters to numbers and potentially ignores
    things that are encapsulated

    Args:
        str_line: String to evaluate
        encapsulator: chars that encapsulate strings to be ignored. Pass an
                      empty string or None to evaluate the whole line.

    Returns:
        ratio: ratio of number of letters to number of numbers, or 1 when
               the line contains no numbers
    """
    line = str_line

    # Remove any encapsulated (by default quoted) text. The caller's
    # encapsulator is honored rather than always stripping double quotes.
    if encapsulator:
        line = strip_encapsulated(str_line, encapsulator=encapsulator)

    n_alpha = sum(1 for c in line if c.isalpha())
    n_numeric = sum(1 for c in line if c.isnumeric())

    # Avoid dividing by zero when there are no numbers at all
    if n_numeric == 0:
        return 1

    return n_alpha / n_numeric
def line_is_header(str_line, header_sep=',', header_indicator='#',
                   previous_alpha_ratio=None, expected_columns=None):
    """
    Determine whether a line is a header line

    When a header indicator is given it is treated as definitive: the line is
    a header only if it starts with the indicator. Otherwise the decision is
    a vote between the checks that can be run with the supplied arguments.

    Args:
        str_line: Line of text to evaluate
        header_sep: Separator between columns, used to compare the number of
                    columns (after removing anything in parentheses) against
                    expected_columns
        header_indicator: String that header lines start with
        previous_alpha_ratio: Letter to number ratio to compare this line's
                              ratio against, a ratio at least as large counts
                              as a vote for a header
        expected_columns: Number of columns a header is expected to have

    Returns:
        Bool: Indicating the line is considered a header
    """
    # Definitive indication of a header line. startswith copes with empty
    # lines and with indicators longer than one character.
    if header_indicator:
        return str_line.startswith(header_indicator)

    # No immediate answer so build confidence
    matches = []

    if previous_alpha_ratio:
        ratio = get_alpha_ratio(str_line)
        matches.append(ratio >= previous_alpha_ratio)

    if header_sep:
        # Separators inside parentheses must not be counted as columns
        line = strip_encapsulated(str_line, encapsulator='()')
        matches.append(len(line.split(header_sep)) == expected_columns)

    # Header only when more checks agree than disagree
    return matches.count(True) > matches.count(False)