atlus
Convert raw address and phone number strings into the OSM format.
atlus is a Python package to convert raw address and phone number strings into
the OSM format. It's designed to be used with US and Canadian phone numbers and
addresses.
>>> import atlus
>>> atlus.abbrs("St. Francis")
"Saint Francis"
>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville",
"addr:state": "CA", "addr:postcode": "98765"}
>>> atlus.get_phone("(202) 900-9019")
"+1-202-900-9019"
"""Convert raw address and phone number strings into the OSM format.

`atlus` is a Python package to convert raw address and phone number strings into
the OSM format. It's designed to be used with US and Canadian phone numbers and
addresses.

```python
>>> import atlus
>>> atlus.abbrs("St. Francis")
"Saint Francis"
>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville",
 "addr:state": "CA", "addr:postcode": "98765"}
>>> atlus.get_phone("(202) 900-9019")
"+1-202-900-9019"
```

"""

# SPDX-FileCopyrightText: 2024-present Will <wahubsch@gmail.com>
#
# SPDX-License-Identifier: MIT

from . import atlus, resources
from .atlus import (
    abbrs,
    get_address,
    get_phone,
    get_title,
    mc_replace,
    ord_replace,
    remove_br_unicode,
    us_replace,
)

# Public API: the helper functions re-exported from `.atlus` above, followed by
# the `atlus` and `resources` submodules themselves.
__all__ = [
    "get_address",
    "get_phone",
    "abbrs",
    "get_title",
    "mc_replace",
    "us_replace",
    "ord_replace",
    "remove_br_unicode",
    "atlus",
    "resources",
]
def get_address(address_string: str) -> tuple[dict[str, str], list[str | None]]:
    """Parse a raw address string into OSM `addr:*` tags.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        tuple[dict[str, str], list[str | None]]:
            The processed address tags and the names of the removed fields.
    """
    try:
        tagged = usaddress.tag(clean_address(address_string), tag_mapping=osm_mapping)
        cleaned = tagged[0]
        removed = []
    except usaddress.RepeatedLabelError as err:
        # the tagger found a label more than once: rebuild the tags by hand
        # from the token/label pairs carried on the exception
        stripped_pairs = [
            (pair[0].strip(" .,#"), pair[1]) for pair in err.parsed_string
        ]
        collapsed = collapse_list(stripped_pairs)
        cleaned, removed = manual_join(_combine_consecutive_tuples(collapsed))

    for unwanted in toss_tags:
        cleaned.pop(unwanted, None)

    if "addr:housenumber" in cleaned:
        # merge whatever split_unit derives from the house number over the tags
        cleaned = {**cleaned, **split_unit(cleaned["addr:housenumber"])}

    if "addr:street" in cleaned:
        expanded_street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub("Street", expanded_street).strip(".")

    if "addr:city" in cleaned:
        titled_city = get_title(cleaned["addr:city"], single_word=True)
        cleaned["addr:city"] = abbrs(titled_city)

    if "addr:state" in cleaned:
        state = cleaned["addr:state"].replace(".", "")
        state_upper = state.upper()
        if state_upper in state_expand:
            cleaned["addr:state"] = state_expand[state_upper]
        elif len(state) == 2 and state_upper in state_expand.values():
            cleaned["addr:state"] = state_upper

    if "addr:unit" in cleaned:
        unit = cleaned["addr:unit"].removeprefix("Space")
        cleaned["addr:unit"] = unit.strip(" #.")

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        trimmed_postcode = post_comp.sub(r"\1", cleaned["addr:postcode"])
        cleaned["addr:postcode"] = trimmed_postcode.replace(" ", "-")

    try:
        validated: Address = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # drop every field the model rejected, report it, and validate again
        rejected: list = [problem.get("loc", [])[0] for problem in err.errors()]
        retained = {
            key: tag_value for key, tag_value in cleaned.items() if key not in rejected
        }
        removed.extend(rejected)
        validated = Address.model_validate(retained)

    return validated.model_dump(exclude_none=True, by_alias=True), removed
Process address strings.
>>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
{"addr:housenumber": "345", "addr:street": "Maple Road",
"addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
>>> get_address("777 Strawberry St.")[0]
{"addr:housenumber": "777", "addr:street": "Strawberry Street"}
>>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
>>> address[0]
{"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
>>> address[1]
["addr:unit"]
Arguments:
- address_string (str): The address string to process.
Returns:
tuple[dict[str, str], list[str | None]]: The processed address string and the removed fields.
def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1-XXX-XXX-XXXX`.

    An optional `+1`/`1` country code (optionally wrapped in parentheses) and
    optional parentheses around the area code are accepted. Digit groups may be
    separated only by spaces, dots, or hyphens.

    ```python
    >>> get_phone("2029009019")
    "+1-202-900-9019"
    >>> get_phone("(202) 900-9019")
    "+1-202-900-9019"
    >>> get_phone("202-900-901")
    ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # The separator class is written `[ .-]` with the hyphen last so that it is
    # a literal. Written as `[ -.]` it is a range from space to "." and also
    # admits characters such as "#", "," or "+". The explicit `\)?` after the
    # country code keeps "(+1) 202-..." valid without relying on that range.
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?\)?[ .-]*)?(?:\(?(\d{3})\)?[ .-]*)(\d{3})[ .-]*(\d{4})$",
        phone,
    )
    if phone_valid:
        area, exchange, line = phone_valid.groups()
        return f"+1-{area}-{exchange}-{line}"
    raise ValueError(f"Invalid phone number: {phone}")
Format phone numbers to the US and Canadian standard format of +1-XXX-XXX-XXXX.
>>> get_phone("2029009019")
"+1-202-900-9019"
>>> get_phone("(202) 900-9019")
"+1-202-900-9019"
>>> get_phone("202-900-901")
ValueError: Invalid phone number: 202-900-901
Arguments:
- phone (str): The phone number to format.
Returns:
str: The formatted phone number.
Raises:
- ValueError: If the phone number is invalid.
def abbrs(value: str) -> str:
    """Run the common abbreviation-expansion helpers over a string, in order.

    ```python
    >>> abbrs("St. Francis")
    "Saint Francis"
    >>> abbrs("E St.")
    "E Street"
    >>> abbrs("E Sewell St")
    "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    # casing and spelling fixes first: ALL-CAPS, Mc- prefixes, 'US', ordinals
    text = get_title(value)
    text = mc_replace(text)
    text = us_replace(text)
    text = ord_replace(text)

    # change likely 'St' to 'Saint'
    text = saint_comp.sub("Saint", text)

    # expand common street and word abbreviations
    text = abbr_join_comp.sub(name_street_expand, text)

    # expand directionals
    text = dir_fill_comp.sub(direct_expand, text)

    # normalize 'US' once more after the expansions above
    text = us_replace(text)

    # uppercase shortened street descriptors
    text = regex.sub(r"\b(C[rh]|S[rh]|[FR]m|Us)\b", cap_match, text)

    # remove unremoved abbr periods
    text = regex.sub(r"([a-zA-Z]{2,})\.", r"\1", text)

    # expand 'SR' if no other street types
    text = sr_comp.sub("State Route", text)

    return text.strip(" .")
Bundle most common abbreviation expansion functions.
>>> abbrs("St. Francis")
"Saint Francis"
>>> abbrs("E St.")
"E Street"
>>> abbrs("E Sewell St")
"East Sewell Street"
Arguments:
- value (str): String to expand.
Returns:
str: Expanded string.
def get_title(value: str, single_word: bool = False) -> str:
    """Title-case a string that is written in ALL CAPS.

    Strings that are not entirely upper-case are returned unchanged. A
    single upper-case word (no spaces) is also left alone unless
    `single_word` is set.

    ```python
    >>> get_title("PALM BEACH")
    "Palm Beach"
    >>> get_title("BOSTON")
    "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether the string should be fixed even if it is a single word.

    Returns:
        str: Fixed string.
    """
    if not value.isupper():
        return value
    if " " in value or single_word:
        # title() lower-cases the letter after "Mc", so repair those prefixes
        return mc_replace(value.title())
    return value
Fix ALL-CAPS string.
>>> get_title("PALM BEACH")
"Palm Beach"
>>> get_title("BOSTON")
"BOSTON"
>>> get_title("BOSTON", single_word=True)
"Boston"
Arguments:
- value: String to fix.
- single_word: Whether the string should be fixed even if it is a single word.
Returns:
str: Fixed string.
def mc_replace(value: str) -> str:
    """Fix string containing improperly formatted Mc- prefix.

    Each whitespace-separated word is split at its first "Mc" and the text
    after it is capitalized (first character upper-case, the rest lower-case).
    Words are re-joined with single spaces.

    ```python
    >>> mc_replace("Fort Mchenry")
    "Fort McHenry"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # partition() yields (word, "", "") when there is no "Mc", so such words
    # pass through unchanged
    pieces = (word.partition("Mc") for word in value.split())
    return " ".join(
        before + marker + after.capitalize() for before, marker, after in pieces
    )
Fix string containing improperly formatted Mc- prefix.
>>> mc_replace("Fort Mchenry")
"Fort McHenry"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def us_replace(value: str) -> str:
    """Normalize the spelled-out variants of "US" in a string.

    ```python
    >>> us_replace("U.S. Route 15")
    "US Route 15"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # replacements are applied in this order
    variants = (
        ("U.S.", "US"),
        ("U. S.", "US"),
        ("U S ", "US "),
    )
    for variant, normalized in variants:
        value = value.replace(variant, normalized)
    return value
Fix string containing improperly formatted US.
>>> us_replace("U.S. Route 15")
"US Route 15"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def ord_replace(value: str) -> str:
    """Fix string containing improperly capitalized ordinal.

    Matches whole words made of digits followed by a two-letter suffix whose
    first letter is an upper-case S, N, R, or T, and hands each match to
    `lower_match` to produce the replacement.

    ```python
    >>> ord_replace("3Rd St. NW")
    "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    ordinal_pattern = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(ordinal_pattern, lower_match, value)
Fix string containing improperly capitalized ordinal.
>>> ord_replace("3Rd St. NW")
"3rd St. NW"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def remove_br_unicode(old: str) -> str:
    """Clean the input string before sending it to the parser.

    HTML line-break tags (`<br>`, `<br/>`, `<br />`, in any letter case) are
    replaced with commas, and every non-ASCII character is removed. Newlines,
    carriage returns, and tabs are ASCII and are therefore left in place.

    ```python
    >>> remove_br_unicode("123 Main St<br/>Springfield")
    "123 Main St,Springfield"
    ```

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # accept the bare `<br>` form and upper-case tags as well as `<br/>`
    without_breaks = regex.sub(r"<br\s*/?>", ",", old, flags=regex.IGNORECASE)
    # remove everything outside the ASCII range
    return regex.sub(r"[^\x00-\x7F]", "", without_breaks)
Clean the input string before sending it to the parser by replacing HTML `<br/>` line-break tags with commas and removing non-ASCII (unicode) characters.
Arguments:
- old (str): String to clean.
Returns:
str: Cleaned string.