Source code for snowex_db.string_management
"""
Module for functions that interpret various strings encountered in files.
These functions either prep, strip, or interpret strings for headers or
the actual data to be uploaded.
"""
import numpy as np
[docs]
def clean_str(messy):
"""
Removes unwanted character in a str that we encounter alot
"""
clean = messy
# Strip of any chars that are beginning and end
for ch in [' ', '\n']:
clean = clean.strip(ch)
# Remove colons but not when its between numbers (e.g time)
if ':' in clean:
work = clean.split(' ')
result = []
for w in work:
s = w.replace(':', '')
if s.isnumeric():
result.append(w)
else:
result.append(s)
clean = ' '.join(result)
# Remove characters anywhere in string that is undesireable
for ch in ['"', "'"]:
clean = clean.replace(ch, '')
clean = clean.strip(' ')
return clean
[docs]
def standardize_key(messy):
"""
Preps a key for use in dataframe columns or dictionary. Makes everything
lowercase, removes units, replaces spaces with underscores.
Args:
messy: string to be cleaned
Returns:
clean: String minus all characters and patterns of no interest
"""
key = messy
# Remove units
for c in ['()', '[]']:
key = strip_encapsulated(key, c)
key = clean_str(key)
key = key.lower().replace(' ', '_')
key = key.lower().replace('-', '_')
# This removes csv byte order mark for files in utf-8 while were encoding with latin
key = ''.join([c for c in key if c not in ''])
return key
[docs]
def remap_data_names(original, rename_map):
"""
Remaps keys in a dictionary according to the rename dictionary. Also can be
used for lists where the entries in the list can be renamed
Args:
original: list/dictionary of names and values that may need remapping
rename_map: Dictionary mapping names (keys) {old: new}
Returns:
new: List/dictionary containing the names remapped
"""
remap_keys = rename_map.keys()
if isinstance(original, dict):
new = {}
for k, v in original.items():
if k in remap_keys:
new_k = rename_map[k]
# handle multisample names that need changing (e.g.
# dielectric_constant_a)
elif k[-2] == '_':
kw = k[0:-2]
if kw in remap_keys:
new_k = k.replace(kw, rename_map[kw])
else:
new_k = k
new[new_k] = v
elif isinstance(original, list):
new = []
for i, v in enumerate(original):
if v in remap_keys:
new.append(rename_map[v])
# Manage multi samples
elif len(v) > 2 and '_' in v:
if v[-2] == '_' and v[0:-2] in remap_keys:
new.append(v.replace(v[0:-2], rename_map[v[0:-2]]))
else:
new.append(v)
else:
new.append(v)
else:
new = original.lower()
if new in remap_keys:
new = rename_map[new]
return new
[docs]
def get_encapsulated(str_line, encapsulator):
"""
Returns items found in the encapsulator, useful for finding units
Args:
str_line: String that has encapusulated info we want removed
encapsulator: string of characters encapusulating info to be removed
Returns:
result: list of strings found inside anything between encapsulators
e.g.
line = 'density (kg/m^3), temperature (C)'
['kg/m^3', 'C'] = get_encapsulated(line, '()')
"""
result = []
if len(encapsulator) > 2:
raise ValueError('encapsulator can only be 1 or 2 chars long!')
elif len(encapsulator) == 2:
lcap = encapsulator[0]
rcap = encapsulator[1]
else:
lcap = rcap = encapsulator
# Split on the lcap
if lcap in str_line:
for i, val in enumerate(str_line.split(lcap)):
# The first one will always be before our encapsulated
if i != 0:
if lcap != rcap:
result.append(val[0:val.index(rcap)])
else:
result.append(val)
return result
[docs]
def strip_encapsulated(str_line, encapsulator):
"""
Removes from a str anything thats encapusulated by characters and the
encapsulating chars themselves
Args:
str_line: String that has encapusulated info we want removed
encapsulator: string of characters encapsulating info to be removed
Returns:
final: String without anything between encapsulators
"""
final = str_line
result = get_encapsulated(final, encapsulator)
if len(encapsulator) == 2:
lcap = encapsulator[0]
rcap = encapsulator[1]
else:
lcap = rcap = encapsulator
# Remove all the encapsulated words
for v in result:
final = final.replace(lcap + v + rcap, '')
# Make sure we remove the last one
return final
[docs]
def parse_none(value):
"""
parses values looking for NANs, Nones, etc...
Args:
value: Value potentially containing a none or nan
Returns:
result: If string value is nan or none, then return None type otherwise
return original value
"""
result = value
# If its a nan or none or the string is empty
if isinstance(value, str):
if value.lower() in ['nan', 'none', '-9999', '-9999.0'] or not value:
result = None
elif isinstance(value, float) or isinstance(value, int):
if np.isnan(value) or value == -9999:
result = None
return result
[docs]
def kw_in_here(kw, d, case_sensitive=True):
"""
Determines if the keyword is found in any of the entries in the List
If any match is found returns true
Can use a list or dictionary. If a dictionary is supplied the keys will be
used
e.g.
dielectric_constant is found in [temperature, dielectric_constant_a]
Args:
kw: Keyword we're searching for
d: List or dictionary with keys of strings
case_sensitive: Boolean indicating whether it should be case sensitive
or not
Returns:
Bool: Indicating the keyword was found
"""
if isinstance(d, dict):
d_keys = d.keys()
else:
d_keys = d
if not case_sensitive:
k = kw.lower()
d_keys = [c.lower() for c in d_keys]
else:
k = kw
truth = [True for c in d_keys if k in c]
return len(truth) > 0
[docs]
def get_alpha_ratio(str_line, encapsulator='""'):
"""
Calculates the ratio of characters to numbers and
potentially ignore things encapsulated
Args:
str_line: String to evaluate
encapsulator: chars that encapsulate strings to be ignored
Returns:
ratio: float ratio of number of letter to number of numbers
"""
line = str_line
# Remove any quoted text
if encapsulator:
line = strip_encapsulated(str_line, encapsulator='""')
n_alpha = len([c for c in line if c.isalpha()])
n_numeric = len([c for c in line if c.isnumeric()])
if n_numeric == 0:
ratio = 1
else:
ratio = n_alpha / n_numeric
return ratio