atlus
Convert raw address and phone number strings into the OSM format.
atlus
is a Python package to convert raw address and phone number strings into
the OSM format. It's designed to be used with US and Canadian phone numbers and
addresses.
>>> import atlus
>>> atlus.abbrs("St. Francis")
"Saint Francis"
>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville",
 "addr:state": "CA", "addr:postcode": "98765"}
>>> atlus.get_phone("(202) 900-9019")
"+1 202-900-9019"
"""Convert raw address and phone number strings into the OSM format.

`atlus` is a Python package to convert raw address and phone number strings into
the OSM format. It's designed to be used with US and Canadian phone numbers and
addresses.

```python
>>> import atlus
>>> atlus.abbrs("St. Francis")
"Saint Francis"
>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville",
 "addr:state": "CA", "addr:postcode": "98765"}
>>> atlus.get_phone("(202) 900-9019")
"+1 202-900-9019"
```

"""

# SPDX-FileCopyrightText: 2024-present Will <wahubsch@gmail.com>
#
# SPDX-License-Identifier: MIT

# Expose the submodules and re-export the helper functions at package level.
from . import atlus, resources
from .atlus import (
    abbrs,
    get_address,
    get_phone,
    get_title,
    mc_replace,
    ord_replace,
    remove_br_unicode,
    us_replace,
)

# Names exported by `from atlus import *`.
__all__ = [
    "get_address",
    "get_phone",
    "abbrs",
    "get_title",
    "mc_replace",
    "us_replace",
    "ord_replace",
    "remove_br_unicode",
    "atlus",
    "resources",
]
def get_address(address_string: str) -> Tuple[Dict[str, str], List[Union[str, None]]]:
    """Turn a raw address string into OSM `addr:*` tags.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        Tuple[Dict[str, str], List[Union[str, None]]]:
            The processed address tags and the names of the fields that were removed.
    """
    try:
        tag_result = usaddress.tag(
            clean_address(address_string), tag_mapping=osm_mapping
        )
    except usaddress.RepeatedLabelError as err:
        # usaddress refused the string; join its raw token/label pairs by hand
        stripped_pairs = [
            (pair[0].strip(" .,#"), pair[1]) for pair in err.parsed_string
        ]
        collapsed = collapse_list(stripped_pairs)
        cleaned, removed = _manual_join(_combine_consecutive_tuples(collapsed))
    else:
        cleaned = tag_result[0]
        removed = []

    for unwanted in toss_tags:
        cleaned.pop(unwanted, None)

    if "addr:housenumber" in cleaned:
        unit_parts = split_unit(cleaned["addr:housenumber"])
        cleaned = {**cleaned, **unit_parts}

    if "addr:street" in cleaned:
        expanded_street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub("Street", expanded_street).strip(".")

    if "addr:city" in cleaned:
        titled_city = get_title(cleaned["addr:city"], single_word=True)
        cleaned["addr:city"] = abbrs(titled_city)

    if "addr:state" in cleaned:
        raw_state = cleaned["addr:state"].replace(".", "")
        state_key = raw_state.upper()
        if state_key in state_expand:
            # known key of the expansion table: use the mapped value
            cleaned["addr:state"] = state_expand[state_key]
        elif len(raw_state) == 2 and state_key in state_expand.values():
            # already one of the table's two-letter values: just upper-case it
            cleaned["addr:state"] = state_key

    if "addr:unit" in cleaned:
        unit = cleaned["addr:unit"].removeprefix("Space")
        cleaned["addr:unit"] = unit.strip(" #.")

    if "addr:postcode" in cleaned:
        # drop surplus postcode digits, then hyphenate a space-separated suffix
        trimmed_postcode = post_comp.sub(r"\1", cleaned["addr:postcode"])
        cleaned["addr:postcode"] = trimmed_postcode.replace(" ", "-")

    try:
        validated: Address = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # discard every field the model rejected and validate what is left
        bad_fields: list = [error.get("loc", [])[0] for error in err.errors()]
        retry_input = {
            key: val for key, val in cleaned.items() if key not in bad_fields
        }
        removed.extend(bad_fields)
        validated = Address.model_validate(retry_input)

    return validated.model_dump(exclude_none=True, by_alias=True), removed
Process address strings.
>>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
{"addr:housenumber": "345", "addr:street": "Maple Road",
"addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
>>> get_address("777 Strawberry St.")[0]
{"addr:housenumber": "777", "addr:street": "Strawberry Street"}
>>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
>>> address[0]
{"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
>>> address[1]
["addr:unit"]
Arguments:
- address_string (str): The address string to process.
Returns:
Tuple[Dict[str, str], List[Union[str, None]]]: The processed address string and the removed fields.
def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    ```python
    >>> get_phone("2029009019")
    "+1 202-900-9019"
    >>> get_phone("(202) 900-9019")
    "+1 202-900-9019"
    >>> get_phone("202-900-901")
    ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # Separators are limited to space, period and hyphen. The hyphen sits last
    # in each character class so it is a literal: written as `[ -.]` it forms a
    # range from space to period that also admits `#`, `$`, `,`, `'` and more.
    # The optional `\)` after the country code keeps "(+1) 202-..." accepted.
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?\)?[ .-]*)?(?:\(?(\d{3})\)?[ .-]*)(\d{3})[ .-]*(\d{4})$",
        phone,
    )
    if phone_valid:
        return (
            f"+1 {phone_valid.group(1)}-{phone_valid.group(2)}-{phone_valid.group(3)}"
        )
    raise ValueError(f"Invalid phone number: {phone}")
Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.
>>> get_phone("2029009019")
"+1 202-900-9019"
>>> get_phone("(202) 900-9019")
"+1 202-900-9019"
>>> get_phone("202-900-901")
ValueError: Invalid phone number: 202-900-901
Arguments:
- phone (str): The phone number to format.
Returns:
str: The formatted phone number.
Raises:
- ValueError: If the phone number is invalid.
def abbrs(value: str) -> str:
    """Run the common abbreviation-expansion helpers over a string in one call.

    ```python
    >>> abbrs("St. Francis")
    "Saint Francis"
    >>> abbrs("E St.")
    "E Street"
    >>> abbrs("E Sewell St")
    "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    # casing fixes first: ALL-CAPS, Mc- prefixes, 'U.S.' variants, ordinals
    expanded = get_title(value)
    expanded = mc_replace(expanded)
    expanded = us_replace(expanded)
    expanded = ord_replace(expanded)

    # a probable 'St' becomes 'Saint'
    expanded = saint_comp.sub("Saint", expanded)

    # common street and word abbreviations
    expanded = abbr_join_comp.sub(name_street_expand, expanded)

    # directionals
    expanded = dir_fill_comp.sub(direct_expand, expanded)

    # 'US' is normalized a second time, after the expansions above
    expanded = us_replace(expanded)

    # shortened street descriptors are upper-cased
    expanded = regex.sub(r"\b(C[rh]|S[rh]|[FR]m|Us)\b", cap_match, expanded)

    # periods still attached to abbreviations are dropped
    expanded = regex.sub(r"([a-zA-Z]{2,})\.", r"\1", expanded)

    # 'SR' is expanded if there are no other street types
    expanded = sr_comp.sub("State Route", expanded)
    return expanded.strip(" .")
Bundle most common abbreviation expansion functions.
>>> abbrs("St. Francis")
"Saint Francis"
>>> abbrs("E St.")
"E Street"
>>> abbrs("E Sewell St")
"East Sewell Street"
Arguments:
- value (str): String to expand.
Returns:
str: Expanded string.
def get_title(value: str, single_word: bool = False) -> str:
    """Title-case a string that is written entirely in capitals.

    ```python
    >>> get_title("PALM BEACH")
    "Palm Beach"
    >>> get_title("BOSTON")
    "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether the string should be fixed even if it is a single word.

    Returns:
        str: Fixed string.
    """
    if not value.isupper():
        return value
    # a lone all-caps word is only converted when the caller opts in
    if single_word or " " in value:
        return mc_replace(value.title())
    return value
Fix ALL-CAPS string.
>>> get_title("PALM BEACH")
"Palm Beach"
>>> get_title("BOSTON")
"BOSTON"
>>> get_title("BOSTON", single_word=True)
"Boston"
Arguments:
- value: String to fix.
- single_word: Whether the string should be fixed even if it is a single word.
Returns:
str: Fixed string.
def mc_replace(value: str) -> str:
    """Restore the capital letter that follows a Mc- prefix.

    ```python
    >>> mc_replace("Fort Mchenry")
    "Fort McHenry"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """

    def _fix_word(word: str) -> str:
        # words without "Mc" come back unchanged: prefix and tail are empty
        head, prefix, tail = word.partition("Mc")
        return head + prefix + tail.capitalize()

    return " ".join(_fix_word(word) for word in value.split())
Fix string containing improperly formatted Mc- prefix.
>>> mc_replace("Fort Mchenry")
"Fort McHenry"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def us_replace(value: str) -> str:
    """Fix string containing improperly formatted US.

    ```python
    >>> us_replace("U.S. Route 15")
    "US Route 15"
    >>> us_replace("U S Route 15")
    "US Route 15"
    >>> us_replace("BYU S Campus Dr")
    "BYU S Campus Dr"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    value = value.replace("U.S.", "US").replace("U. S.", "US")
    # Only a stand-alone "U S" is joined: the word boundary keeps a word that
    # merely ends in "U" from being fused with a following "S".
    return regex.sub(r"\bU S ", "US ", value)
Fix string containing improperly formatted US.
>>> us_replace("U.S. Route 15")
"US Route 15"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def ord_replace(value: str) -> str:
    """Repair the capitalization of ordinal suffixes in a string.

    ```python
    >>> ord_replace("3Rd St. NW")
    "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # digits followed by a two-letter suffix whose first letter is upper-case
    ordinal_pattern = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(ordinal_pattern, lower_match, value)
Fix string containing improperly capitalized ordinal.
>>> ord_replace("3Rd St. NW")
"3rd St. NW"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def remove_br_unicode(old: str) -> str:
    """Clean the input string before sending to parser.

    HTML line-break tags are replaced with commas and non-ASCII characters are
    removed.

    ```python
    >>> remove_br_unicode("123 Main St<br>Springfield")
    "123 Main St,Springfield"
    ```

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # accept `<br>`, `<br/>` and `<br />` in any letter case
    old = regex.sub(r"<br\s*/?>", ",", old, flags=regex.IGNORECASE)
    return regex.sub(r"[^\x00-\x7F]", "", old)  # remove non-ASCII characters
Clean the input string before sending to parser by replacing HTML line-break tags with commas and removing non-ASCII characters.
Arguments:
- old (str): String to clean.
Returns:
str: Cleaned string.