atlus

Convert raw address and phone number strings into the OSM format.

atlus is a Python package to convert raw address and phone number strings into the OSM format. It's designed to be used with US and Canadian phone numbers and addresses.

>>> import atlus
>>> atlus.abbrs("St. Francis")
"Saint Francis"
>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville",
    "addr:state": "CA", "addr:postcode": "98765"}
>>> atlus.get_phone("(202) 900-9019")
"+1-202-900-9019"
 1"""Convert raw address and phone number strings into the OSM format.
 2
 3`atlus` is a Python package to convert raw address and phone number strings into
 4the OSM format. It's designed to be used with US and Canadian phone numbers and
 5addresses.
 6
 7```python
 8>>> import atlus
 9>>> atlus.abbrs("St. Francis")
10"Saint Francis"
11>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
12{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville",
13    "addr:state": "CA", "addr:postcode": "98765"}
14>>> atlus.get_phone("(202) 900-9019")
15"+1-202-900-9019"
16```
17
18"""
19
20# SPDX-FileCopyrightText: 2024-present Will <wahubsch@gmail.com>
21#
22# SPDX-License-Identifier: MIT
23
24from . import atlus, resources
25from .atlus import (
26    abbrs,
27    get_address,
28    get_phone,
29    get_title,
30    mc_replace,
31    ord_replace,
32    remove_br_unicode,
33    us_replace,
34)
35
# Names exported by `from atlus import *`: the helper functions re-exported from
# the `.atlus` submodule, plus the `atlus` and `resources` submodules themselves.
__all__ = [
    "get_address",
    "get_phone",
    "abbrs",
    "get_title",
    "mc_replace",
    "us_replace",
    "ord_replace",
    "remove_br_unicode",
    "atlus",
    "resources",
]
def get_address(address_string: str) -> tuple[dict[str, str], list[str | None]]:
def get_address(address_string: str) -> tuple[dict[str, str], list[str | None]]:
    """Process address strings.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        tuple[dict[str, str], list[str | None]]:
        The processed address as a dict of OSM tags, and the list of fields
        that were removed while parsing or validating.
    """
    try:
        cleaned = usaddress.tag(clean_address(address_string), tag_mapping=osm_mapping)[
            0
        ]
        removed = []
    except usaddress.RepeatedLabelError as err:
        # The tagger saw the same label more than once; rebuild the tags by
        # hand from the partially parsed (token, label) pairs on the error.
        collapsed = collapse_list(
            [(i[0].strip(" .,#"), i[1]) for i in err.parsed_string]
        )
        cleaned, removed = manual_join(_combine_consecutive_tuples(collapsed))

    # drop every tag listed in `toss_tags`
    for toss in toss_tags:
        cleaned.pop(toss, None)

    if "addr:housenumber" in cleaned:
        # whatever `split_unit` returns overrides the existing tags
        cleaned = {**cleaned, **split_unit(cleaned["addr:housenumber"])}

    if "addr:street" in cleaned:
        street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub("Street", street).strip(".")

    if "addr:city" in cleaned:
        cleaned["addr:city"] = abbrs(get_title(cleaned["addr:city"], single_word=True))

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        if old.upper() in state_expand:
            cleaned["addr:state"] = state_expand[old.upper()]
        elif len(old) == 2 and old.upper() in state_expand.values():
            # already a two-letter value known to `state_expand`; normalize case
            cleaned["addr:state"] = old.upper()

    if "addr:unit" in cleaned:
        cleaned["addr:unit"] = cleaned["addr:unit"].removeprefix("Space").strip(" #.")

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        cleaned["addr:postcode"] = post_comp.sub(
            r"\1", cleaned["addr:postcode"]
        ).replace(" ", "-")

    validated: Address
    try:
        validated = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # Collect the offending field of each error. Errors with an empty
        # `loc` are skipped instead of raising IndexError; if nothing can be
        # dropped, the retry below re-raises the ValidationError.
        bad_fields: list = [
            error["loc"][0] for error in err.errors() if error.get("loc")
        ]
        cleaned_ret = dict(cleaned)
        for field in bad_fields:
            cleaned_ret.pop(field, None)

        removed.extend(bad_fields)
        validated = Address.model_validate(cleaned_ret)

    return validated.model_dump(exclude_none=True, by_alias=True), removed

Process address strings.

>>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
{"addr:housenumber": "345", "addr:street": "Maple Road",
"addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
>>> get_address("777 Strawberry St.")[0]
{"addr:housenumber": "777", "addr:street": "Strawberry Street"}
>>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
>>> address[0]
{"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
>>> address[1]
["addr:unit"]
Arguments:
  • address_string (str): The address string to process.
Returns:

tuple[dict[str, str], list[str | None]]: The processed address as a dict of OSM tags, and the list of fields that were removed.

def get_phone(phone: str) -> str:
def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1-XXX-XXX-XXXX`.

    Leading and trailing whitespace is ignored.

    ```python
    >>> get_phone("2029009019")
    "+1-202-900-9019"
    >>> get_phone("(202) 900-9019")
    "+1-202-900-9019"
    >>> get_phone("202-900-9019 ")
    "+1-202-900-9019"
    >>> get_phone("202-900-901")
    ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # NOTE(review): `[ -.]` is a character *range* (space through '.'), so it
    # also admits characters such as '(' ')' ',' between digit groups. It is
    # kept unchanged so that previously accepted inputs stay accepted.
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?[ -.]*)?(?:\(?(\d{3})\)?[ -.]*)(\d{3})[ -.]*(\d{4})$",
        phone.strip(),
    )
    if phone_valid is None:
        # report the caller's original, unstripped input
        raise ValueError(f"Invalid phone number: {phone}")

    area, exchange, line = phone_valid.groups()
    return f"+1-{area}-{exchange}-{line}"

Format phone numbers to the US and Canadian standard format of +1-XXX-XXX-XXXX.

>>> get_phone("2029009019")
"+1-202-900-9019"
>>> get_phone("(202) 900-9019")
"+1-202-900-9019"
>>> get_phone("202-900-901")
ValueError: Invalid phone number: 202-900-901
Arguments:
  • phone (str): The phone number to format.
Returns:

str: The formatted phone number.

Raises:
  • ValueError: If the phone number is invalid.
def abbrs(value: str) -> str:
def abbrs(value: str) -> str:
    """Run the common abbreviation-expansion helpers over a string.

    ```python
    >>> abbrs("St. Francis")
    "Saint Francis"
    >>> abbrs("E St.")
    "E Street"
    >>> abbrs("E Sewell St")
    "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    # casing fixes first: title-case, Mc- prefixes, 'US' spellings, ordinals
    titled = get_title(value)
    text = ord_replace(us_replace(mc_replace(titled)))

    # a likely 'St' becomes 'Saint'
    text = saint_comp.sub("Saint", text)

    # common street and word abbreviations
    text = abbr_join_comp.sub(name_street_expand, text)

    # directionals
    text = dir_fill_comp.sub(direct_expand, text)

    # normalize 'US' again after the expansions above
    text = us_replace(text)

    # shortened street descriptors are handed to `cap_match`
    text = regex.sub(r"\b(C[rh]|S[rh]|[FR]m|Us)\b", cap_match, text)

    # drop periods still attached to words of two or more letters
    text = regex.sub(r"([a-zA-Z]{2,})\.", r"\1", text)

    # expand 'SR' if no other street types
    text = sr_comp.sub("State Route", text)

    return text.strip(" .")

Bundle most common abbreviation expansion functions.

>>> abbrs("St. Francis")
"Saint Francis"
>>> abbrs("E St.")
"E Street"
>>> abbrs("E Sewell St")
"East Sewell Street"
Arguments:
  • value (str): String to expand.
Returns:

str: Expanded string.

def get_title(value: str, single_word: bool = False) -> str:
def get_title(value: str, single_word: bool = False) -> str:
    """Title-case a string that is written in ALL CAPS.

    A string without a space is left untouched unless `single_word` is set.

    ```python
    >>> get_title("PALM BEACH")
    "Palm Beach"
    >>> get_title("BOSTON")
    "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether the string should be fixed even if it is a single word.

    Returns:
        str: Fixed string.
    """
    # anything that is not entirely upper-case is returned as-is
    if not value.isupper():
        return value

    if single_word or " " in value:
        return mc_replace(value.title())

    return value

Fix ALL-CAPS string.

>>> get_title("PALM BEACH")
"Palm Beach"
>>> get_title("BOSTON")
"BOSTON"
>>> get_title("BOSTON", single_word=True)
"Boston"
Arguments:
  • value: String to fix.
  • single_word: Whether the string should be fixed even if it is a single word.
Returns:

str: Fixed string.

def mc_replace(value: str) -> str:
 99def mc_replace(value: str) -> str:
100    """Fix string containing improperly formatted Mc- prefix.
101
102    ```python
103    >>> mc_replace("Fort Mchenry")
104    "Fort McHenry"
105    ```
106
107    Args:
108        value: String to fix.
109
110    Returns:
111        str: Fixed string.
112    """
113    words = []
114    for word in value.split():
115        mc_match = word.partition("Mc")
116        words.append(mc_match[0] + mc_match[1] + mc_match[2].capitalize())
117    return " ".join(words)

Fix string containing improperly formatted Mc- prefix.

>>> mc_replace("Fort Mchenry")
"Fort McHenry"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def us_replace(value: str) -> str:
def us_replace(value: str) -> str:
    """Fix string containing improperly formatted US.

    "U.S.", "U. S." and "U S " are rewritten to "US", but only when the "U"
    starts a word, so text such as "BAYOU S ROAD" is left alone.

    ```python
    >>> us_replace("U.S. Route 15")
    "US Route 15"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # `\b` stops a match where the "U" is the tail of a longer word
    replacements = (
        (r"\bU\.S\.", "US"),
        (r"\bU\. S\.", "US"),
        (r"\bU S ", "US "),
    )
    for pattern, fixed in replacements:
        value = regex.sub(pattern, fixed, value)
    return value

Fix string containing improperly formatted US.

>>> us_replace("U.S. Route 15")
"US Route 15"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def ord_replace(value: str) -> str:
120def ord_replace(value: str) -> str:
121    """Fix string containing improperly capitalized ordinal.
122
123    ```python
124    >>> ord_replace("3Rd St. NW")
125    "3rd St. NW"
126    ```
127
128    Args:
129        value: String to fix.
130
131    Returns:
132        str: Fixed string.
133    """
134    return regex.sub(r"(\b\d+[SNRT][tTdDhH]\b)", lower_match, value)

Fix string containing improperly capitalized ordinal.

>>> ord_replace("3Rd St. NW")
"3rd St. NW"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def remove_br_unicode(old: str) -> str:
def remove_br_unicode(old: str) -> str:
    """Clean the input string before sending to parser.

    HTML line-break tags (`<br>`, `<br/>`, `<br />`, in any letter case) are
    replaced with commas, and every non-ASCII character is removed.

    ```python
    >>> remove_br_unicode("123 Main St<br/>Springfield")
    "123 Main St,Springfield"
    ```

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # the slash is optional so that a plain `<br>` is handled as well
    without_breaks = regex.sub(r"<br ?/?>", ",", old, flags=regex.IGNORECASE)
    # drop everything outside the 7-bit ASCII range
    return regex.sub(r"[^\x00-\x7F]", "", without_breaks)

Clean the input string before sending to parser by replacing HTML `<br/>` tags with commas and removing non-ASCII (unicode) characters.

Arguments:
  • old (str): String to clean.
Returns:

str: Cleaned string.