atlus
`atlus` is a Python package to convert raw address and phone number strings into the OSM format.
It's designed to be used with US and Canadian phone numbers and addresses.
>>> import atlus
>>> atlus.abbrs("St. Francis")
"Saint Francis"
>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville", "addr:state": "CA", "addr:postcode": "98765"}
>>> atlus.get_phone("(202) 900-9019")
"+1 202-900-9019"
1"""`atlus` is a Python package to convert raw address and phone number strings into the OSM format. 2It's designed to be used with US and Canadian phone numbers and addresses. 3 4```python 5>>> import atlus 6>>> atlus.abbrs("St. Francis") 7"Saint Francis" 8>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0] 9{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville", "addr:state": "CA", "addr:postcode": "98765"} 10>>> atlus.get_phone("(202) 900-9019") 11"+1 202-900-9019" 12``` 13 14""" 15 16# SPDX-FileCopyrightText: 2024-present Will <wahubsch@gmail.com> 17# 18# SPDX-License-Identifier: MIT 19 20from .atlus import ( 21 get_address, 22 get_phone, 23 abbrs, 24 get_title, 25 mc_replace, 26 us_replace, 27 ord_replace, 28 remove_br_unicode, 29) 30from . import atlus 31from . import resources 32 33__all__ = [ 34 "get_address", 35 "get_phone", 36 "abbrs", 37 "get_title", 38 "mc_replace", 39 "us_replace", 40 "ord_replace", 41 "remove_br_unicode", 42 "atlus", 43 "resources", 44]
def get_address(
    address_string: str,
) -> Tuple[Dict[str, str], List[Union[str, None]]]:
    """Parse a raw address string into OSM-style address tags.

    The string is cleaned with `clean_address` and tagged by `usaddress`
    using `osm_mapping`. If the parser reports a repeated label, the partially
    parsed tokens are collapsed and joined manually instead. Each recognised
    field is then normalised, and the result is validated against the
    `Address` model; any field that fails validation is dropped and reported.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        Tuple[Dict[str, str], List[Union[str, None]]]:
            The validated address tags (aliases as keys, unset fields
            omitted) and the list of fields that were removed.
    """
    try:
        tagged = usaddress.tag(clean_address(address_string), tag_mapping=osm_mapping)
    except usaddress.RepeatedLabelError as err:
        # The parser saw the same label twice: fall back to joining the
        # partially parsed (token, label) pairs ourselves.
        stripped_pairs = [
            (token.strip(" .,#"), label) for token, label, *_ in err.parsed_string
        ]
        cleaned, removed = _manual_join(
            _combine_consecutive_tuples(collapse_list(stripped_pairs))
        )
    else:
        cleaned = tagged[0]
        removed = []

    # Drop the tags listed in toss_tags.
    for unwanted in toss_tags:
        cleaned.pop(unwanted, None)

    if "addr:housenumber" in cleaned:
        # Merge whatever split_unit derives from the house number on top of
        # the existing tags.
        unit_parts = split_unit(cleaned["addr:housenumber"])
        cleaned = dict(cleaned)
        cleaned.update(unit_parts)

    if "addr:street" in cleaned:
        expanded_street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub("Street", expanded_street).strip(".")

    if "addr:city" in cleaned:
        titled_city = get_title(cleaned["addr:city"], single_word=True)
        cleaned["addr:city"] = abbrs(titled_city)

    if "addr:state" in cleaned:
        raw_state = cleaned["addr:state"].replace(".", "")
        upper_state = raw_state.upper()
        if upper_state in state_expand:
            # Known key in the lookup table: use its mapped value.
            cleaned["addr:state"] = state_expand[upper_state]
        elif len(raw_state) == 2 and upper_state in state_expand.values():
            # Already a two-letter value from the table: just fix the case.
            cleaned["addr:state"] = upper_state

    if "addr:unit" in cleaned:
        unit = cleaned["addr:unit"].removeprefix("Space")
        cleaned["addr:unit"] = unit.strip(" #.")

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        trimmed_postcode = post_comp.sub(r"\1", cleaned["addr:postcode"])
        cleaned["addr:postcode"] = trimmed_postcode.replace(" ", "-")

    try:
        validated: Address = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # Retry without the fields the model rejected and report them.
        bad_fields: list = [detail.get("loc", [])[0] for detail in err.errors()]
        surviving = {
            key: val for key, val in dict(cleaned).items() if key not in bad_fields
        }
        removed.extend(bad_fields)
        validated = Address.model_validate(surviving)

    return validated.model_dump(exclude_none=True, by_alias=True), removed
Process address strings.
>>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
{"addr:housenumber": "345", "addr:street": "Maple Road",
"addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
>>> get_address("777 Strawberry St.")[0]
{"addr:housenumber": "777", "addr:street": "Strawberry Street"}
>>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
>>> address[0]
{"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
>>> address[1]
["addr:unit"]
Arguments:
- address_string (str): The address string to process.
Returns:
Tuple[Dict[str, str], List[Union[str, None]]]: The processed address string and the removed fields.
def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    An optional country code (`+1` / `1`) and optional parentheses around the
    area code are accepted. The digit groups may be separated by any run of
    spaces, dots or hyphens.

    ```python
    >>> get_phone("2029009019")
    "+1 202-900-9019"
    >>> get_phone("(202) 900-9019")
    "+1 202-900-9019"
    >>> get_phone("202-900-901")
    ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # The separator class is written `[ .-]` with the hyphen last so that it
    # is a literal hyphen. Written as `[ -.]` it is a *range* from space to
    # dot, which also admits characters such as `#`, `$`, `+` and `,`.
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?[ .-]*)?(?:\(?(\d{3})\)?[ .-]*)(\d{3})[ .-]*(\d{4})$",
        phone,
    )
    if phone_valid:
        area_code, exchange, subscriber = phone_valid.groups()
        return f"+1 {area_code}-{exchange}-{subscriber}"
    raise ValueError(f"Invalid phone number: {phone}")
Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.
>>> get_phone("2029009019")
"+1 202-900-9019"
>>> get_phone("(202) 900-9019")
"+1 202-900-9019"
>>> get_phone("202-900-901")
ValueError: Invalid phone number: 202-900-901
Arguments:
- phone (str): The phone number to format.
Returns:
str: The formatted phone number.
Raises:
- ValueError: If the phone number is invalid.
def abbrs(value: str) -> str:
    """Run the common abbreviation-expansion steps over a string, in order.

    ```python
    >>> abbrs("St. Francis")
    "Saint Francis"
    >>> abbrs("E St.")
    "E Street"
    >>> abbrs("E Sewell St")
    "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string, with surrounding spaces and periods stripped.
    """
    # Case and spelling fix-ups first; each step feeds the next.
    expanded = get_title(value)
    expanded = mc_replace(expanded)
    expanded = us_replace(expanded)
    expanded = ord_replace(expanded)

    # change likely 'St' to 'Saint'
    expanded = saint_comp.sub("Saint", expanded)

    # expand common street and word abbreviations
    expanded = abbr_join_comp.sub(name_street_expand, expanded)

    # expand directionals
    expanded = dir_fill_comp.sub(direct_expand, expanded)

    # normalize 'US'
    expanded = us_replace(expanded)

    # uppercase shortened street descriptors
    expanded = regex.sub(r"\b(C[rh]|S[rh]|[FR]m|Us)\b", cap_match, expanded)

    # remove unremoved abbr periods
    expanded = regex.sub(r"([a-zA-Z]{2,})\.", r"\1", expanded)

    # expand 'SR' if no other street types
    expanded = sr_comp.sub("State Route", expanded)

    return expanded.strip(" .")
Bundle most common abbreviation expansion functions.
>>> abbrs("St. Francis")
"Saint Francis"
>>> abbrs("E St.")
"E Street"
>>> abbrs("E Sewell St")
"East Sewell Street"
Arguments:
- value (str): String to expand.
Returns:
str: Expanded string.
def get_title(value: str, single_word: bool = False) -> str:
    """Title-case a string that is written entirely in capitals.

    Strings that are not all upper-case are returned unchanged. An all-caps
    string is converted only when it contains a space, or when `single_word`
    is set; the converted text is also passed through `mc_replace`.

    ```python
    >>> get_title("PALM BEACH")
    "Palm Beach"
    >>> get_title("BOSTON")
    "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether the string should be fixed even if it is a single word.

    Returns:
        str: Fixed string.
    """
    if not value.isupper():
        return value
    if " " in value or single_word:
        return mc_replace(value.title())
    # A lone all-caps word is left alone unless explicitly requested.
    return value
Fix ALL-CAPS string.
>>> get_title("PALM BEACH")
"Palm Beach"
>>> get_title("BOSTON")
"BOSTON"
>>> get_title("BOSTON", single_word=True)
"Boston"
Arguments:
- value: String to fix.
- single_word: Whether the string should be fixed even if it is a single word.
Returns:
str: Fixed string.
def mc_replace(value: str) -> str:
    """Restore the capital letter that follows a ``Mc`` prefix.

    The string is split on whitespace and re-joined with single spaces, so
    runs of whitespace are collapsed as a side effect.

    ```python
    >>> mc_replace("Fort Mchenry")
    "Fort McHenry"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """

    def _fix_word(word: str) -> str:
        # Everything after the first "Mc" is capitalised; a word without
        # "Mc" partitions to (word, "", "") and comes back unchanged.
        before, prefix, after = word.partition("Mc")
        return f"{before}{prefix}{after.capitalize()}"

    return " ".join(_fix_word(word) for word in value.split())
Fix string containing improperly formatted Mc- prefix.
>>> mc_replace("Fort Mchenry")
"Fort McHenry"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def us_replace(value: str) -> str:
    """Normalise the spelled-out variants of ``US`` in a string.

    ```python
    >>> us_replace("U.S. Route 15")
    "US Route 15"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # Applied in this order; each pair is (variant, replacement).
    substitutions = (
        ("U.S.", "US"),
        ("U. S.", "US"),
        ("U S ", "US "),
    )
    for variant, replacement in substitutions:
        value = value.replace(variant, replacement)
    return value
Fix string containing improperly formatted US.
>>> us_replace("U.S. Route 15")
"US Route 15"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def ord_replace(value: str) -> str:
    """Fix ordinals whose suffix was capitalised, such as ``3Rd``.

    ```python
    >>> ord_replace("3Rd St. NW")
    "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # Digits followed by a two-letter suffix whose first letter is an
    # upper-case S, N, R or T; each match is rewritten by `lower_match`.
    ordinal_pattern = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(ordinal_pattern, lower_match, value)
Fix string containing improperly capitalized ordinal.
>>> ord_replace("3Rd St. NW")
"3rd St. NW"
Arguments:
- value: String to fix.
Returns:
str: Fixed string.
def remove_br_unicode(old: str) -> str:
    """Clean the input string before sending it to the parser.

    HTML line-break tags (`<br>`, `<br/>`, `<br />`) are replaced with commas
    and every non-ASCII character is removed. Real newline, carriage-return
    and tab characters are ASCII and are therefore left in place.

    ```python
    >>> remove_br_unicode("123 Main St<br/>Apt 4")
    "123 Main St,Apt 4"
    ```

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # `/?` makes the slash optional so the plain `<br>` form is handled too.
    without_breaks = regex.sub(r"<br ?/?>", ",", old)
    # Drop everything outside the ASCII range.
    return regex.sub(r"[^\x00-\x7F\n\r\t]", "", without_breaks)
Clean the input string before sending it to the parser by replacing `<br/>` tags with commas and removing non-ASCII characters.
Arguments:
- old (str): String to clean.
Returns:
str: Cleaned string.