atlus

Convert raw address and phone number strings into the OSM format.

atlus is a Python package to convert raw address and phone number strings into the OSM format. It's designed to be used with US and Canadian phone numbers and addresses.

>>> import atlus
>>> atlus.abbrs("St. Francis")
"Saint Francis"
>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville",
    "addr:state": "CA", "addr:postcode": "98765"}
>>> atlus.get_phone("(202) 900-9019")
"+1 202-900-9019"

View Source

"""Convert raw address and phone number strings into the OSM format.

`atlus` is a Python package to convert raw address and phone number strings into
the OSM format. It's designed to be used with US and Canadian phone numbers and
addresses.

```python
>>> import atlus
>>> atlus.abbrs("St. Francis")
"Saint Francis"
>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville",
    "addr:state": "CA", "addr:postcode": "98765"}
>>> atlus.get_phone("(202) 900-9019")
"+1 202-900-9019"
```

"""

# SPDX-FileCopyrightText: 2024-present Will <wahubsch@gmail.com>
#
# SPDX-License-Identifier: MIT

from . import atlus, resources
from .atlus import (
    abbrs,
    get_address,
    get_phone,
    get_title,
    mc_replace,
    ord_replace,
    remove_br_unicode,
    us_replace,
)

# Public API of the package: the helper functions re-exported from `.atlus`
# above, plus the `atlus` and `resources` submodules themselves.
__all__ = [
    "get_address",
    "get_phone",
    "abbrs",
    "get_title",
    "mc_replace",
    "us_replace",
    "ord_replace",
    "remove_br_unicode",
    "atlus",
    "resources",
]

def get_address(address_string: str) -> Tuple[Dict[str, str], List[Optional[str]]]: View Source

def get_address(address_string: str) -> Tuple[Dict[str, str], List[Union[str, None]]]:
    """Turn a raw address string into OSM `addr:*` tags.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        Tuple[Dict[str, str], List[Union[str, None]]]:
        The processed address tags and the names of the fields that were removed.
    """
    # Nothing has been removed yet; the fallback branch below overwrites this.
    removed: list = []
    try:
        tagged = usaddress.tag(clean_address(address_string), tag_mapping=osm_mapping)
        cleaned = tagged[0]
    except usaddress.RepeatedLabelError as err:
        # The tagger saw the same label more than once: strip stray punctuation
        # from each parsed token and join the pieces manually instead.
        stripped_pairs = [(pair[0].strip(" .,#"), pair[1]) for pair in err.parsed_string]
        merged_pairs = _combine_consecutive_tuples(collapse_list(stripped_pairs))
        cleaned, removed = _manual_join(merged_pairs)

    # Drop the tags the module has chosen never to emit.
    for unwanted in toss_tags:
        cleaned.pop(unwanted, None)

    if "addr:housenumber" in cleaned:
        # Whatever split_unit returns is layered over the existing tags.
        unit_parts = split_unit(cleaned["addr:housenumber"])
        cleaned = {**cleaned, **unit_parts}

    if "addr:street" in cleaned:
        expanded_street = abbrs(cleaned["addr:street"])
        cleaned["addr:street"] = street_comp.sub("Street", expanded_street).strip(".")

    if "addr:city" in cleaned:
        # Title-case even a one-word ALL-CAPS city before expanding abbreviations.
        titled_city = get_title(cleaned["addr:city"], single_word=True)
        cleaned["addr:city"] = abbrs(titled_city)

    if "addr:state" in cleaned:
        old = cleaned["addr:state"].replace(".", "")
        upper_state = old.upper()
        if upper_state in state_expand:
            # A key of state_expand: swap it for the mapped value.
            cleaned["addr:state"] = state_expand[upper_state]
        elif len(old) == 2 and upper_state in state_expand.values():
            # Already one of the mapped values: just normalize the case.
            cleaned["addr:state"] = upper_state

    if "addr:unit" in cleaned:
        unit_text = cleaned["addr:unit"].removeprefix("Space")
        cleaned["addr:unit"] = unit_text.strip(" #.")

    if "addr:postcode" in cleaned:
        # remove extraneous postcode digits
        trimmed_postcode = post_comp.sub(r"\1", cleaned["addr:postcode"])
        cleaned["addr:postcode"] = trimmed_postcode.replace(" ", "-")

    try:
        validated: Address = Address.model_validate(dict(cleaned))
    except ValidationError as err:
        # Report the first location element of every validation error as a
        # removed field, then validate again without those fields.
        bad_fields: list = [each.get("loc", [])[0] for each in err.errors()]
        cleaned_ret = {
            key: val for key, val in dict(cleaned).items() if key not in bad_fields
        }
        removed.extend(bad_fields)
        validated = Address.model_validate(cleaned_ret)

    return validated.model_dump(exclude_none=True, by_alias=True), removed

Process address strings.

>>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
{"addr:housenumber": "345", "addr:street": "Maple Road",
"addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
>>> get_address("777 Strawberry St.")[0]
{"addr:housenumber": "777", "addr:street": "Strawberry Street"}
>>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
>>> address[0]
{"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
>>> address[1]
["addr:unit"]

Arguments:

address_string (str): The address string to process.

Returns:

Tuple[Dict[str, str], List[Union[str, None]]]: The processed address tags and the list of fields that were removed.

def get_phone(phone: str) -> str: View Source

def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    An optional `+1`/`1` country code (optionally wrapped in parentheses), an
    optionally parenthesized area code, and space, dot or hyphen separators
    between the groups are accepted.

    ```python
    >>> get_phone("2029009019")
    "+1 202-900-9019"
    >>> get_phone("(202) 900-9019")
    "+1 202-900-9019"
    >>> get_phone("202-900-901")
    ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # The separator class is written `[ .-]` with the hyphen last so it is a
    # literal. The earlier `[ -.]` was a character *range* from space to '.',
    # which also let through characters such as '#', '*' and ','. The explicit
    # `\)?` after the country code keeps "(+1) 202-..." working now that ')'
    # is no longer swallowed by the separator class.
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?\)?[ .-]*)?(?:\(?(\d{3})\)?[ .-]*)(\d{3})[ .-]*(\d{4})$",
        phone,
    )
    if phone_valid is None:
        raise ValueError(f"Invalid phone number: {phone}")

    area_code, exchange, line_number = phone_valid.groups()
    return f"+1 {area_code}-{exchange}-{line_number}"

Format phone numbers to the US and Canadian standard format of +1 XXX-XXX-XXXX.

>>> get_phone("2029009019")
"+1 202-900-9019"
>>> get_phone("(202) 900-9019")
"+1 202-900-9019"
>>> get_phone("202-900-901")
ValueError: Invalid phone number: 202-900-901

Arguments:

phone (str): The phone number to format.

Returns:

str: The formatted phone number.

Raises:

ValueError: If the phone number is invalid.

def abbrs(value: str) -> str: View Source

def abbrs(value: str) -> str:
    """Run the common abbreviation-expansion steps over a string, in order.

    ```python
    >>> abbrs("St. Francis")
    "Saint Francis"
    >>> abbrs("E St.")
    "E Street"
    >>> abbrs("E Sewell St")
    "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    # Casing fixes first: title-case ALL-CAPS input, then repair "Mc" names,
    # "U.S." spellings and ordinal suffixes.
    text = get_title(value)
    text = mc_replace(text)
    text = us_replace(text)
    text = ord_replace(text)

    # change likely 'St' to 'Saint'
    text = saint_comp.sub("Saint", text)

    # expand common street and word abbreviations
    text = abbr_join_comp.sub(name_street_expand, text)

    # expand directionals
    text = dir_fill_comp.sub(direct_expand, text)

    # normalize 'US' again, after the expansions above
    text = us_replace(text)

    # uppercase shortened street descriptors
    short_descriptor = r"\b(C[rh]|S[rh]|[FR]m|Us)\b"
    text = regex.sub(short_descriptor, cap_match, text)

    # remove unremoved abbr periods
    leftover_period = r"([a-zA-Z]{2,})\."
    text = regex.sub(leftover_period, r"\1", text)

    # expand 'SR' if no other street types
    text = sr_comp.sub("State Route", text)

    return text.strip(" .")

Bundle most common abbreviation expansion functions.

>>> abbrs("St. Francis")
"Saint Francis"
>>> abbrs("E St.")
"E Street"
>>> abbrs("E Sewell St")
"East Sewell Street"

Arguments:

value (str): String to expand.

Returns:

str: Expanded string.

def get_title(value: str, single_word: bool = False) -> str: View Source

def get_title(value: str, single_word: bool = False) -> str:
    """Title-case a string that is written in ALL CAPS.

    ```python
    >>> get_title("PALM BEACH")
    "Palm Beach"
    >>> get_title("BOSTON")
    "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether the string should be fixed even if it is a single word.

    Returns:
        str: Fixed string.
    """
    # Anything that is not entirely upper-case is returned untouched.
    if not value.isupper():
        return value

    # A lone upper-case word is only converted when the caller asks for it.
    if single_word or " " in value:
        return mc_replace(value.title())
    return value

Fix ALL-CAPS string.

>>> get_title("PALM BEACH")
"Palm Beach"
>>> get_title("BOSTON")
"BOSTON"
>>> get_title("BOSTON", single_word=True)
"Boston"

Arguments:

value: String to fix.
single_word: Whether the string should be fixed even if it is a single word.

Returns:

str: Fixed string.

def mc_replace(value: str) -> str: View Source

def mc_replace(value: str) -> str:
    """Restore the capital letter that follows a "Mc" prefix.

    ```python
    >>> mc_replace("Fort Mchenry")
    "Fort McHenry"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """

    def _fix_word(word: str) -> str:
        # Split at the first "Mc"; when it is absent the separator and tail are
        # empty strings, so the word comes back unchanged.
        head, prefix, tail = word.partition("Mc")
        return f"{head}{prefix}{tail.capitalize()}"

    # split() with no argument discards runs of whitespace, so the result is
    # re-joined with single spaces.
    return " ".join(_fix_word(word) for word in value.split())

Fix string containing improperly formatted Mc- prefix.

>>> mc_replace("Fort Mchenry")
"Fort McHenry"

Arguments:

value: String to fix.

Returns:

str: Fixed string.

def us_replace(value: str) -> str: View Source

def us_replace(value: str) -> str:
    """Fix string containing improperly formatted US.

    The spellings "U.S.", "U. S." and a free-standing "U S " are all rewritten
    to "US".

    ```python
    >>> us_replace("U.S. Route 15")
    "US Route 15"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    value = value.replace("U.S.", "US").replace("U. S.", "US")
    # Require a word boundary before the "U" so that an upper-case word that
    # merely ends in U is left alone ("BAYOU S MAIN" must not become
    # "BAYOUS MAIN").
    return regex.sub(r"\bU S ", "US ", value)

Fix string containing improperly formatted US.

>>> us_replace("U.S. Route 15")
"US Route 15"

Arguments:

value: String to fix.

Returns:

str: Fixed string.

def ord_replace(value: str) -> str: View Source

def ord_replace(value: str) -> str:
    """Repair ordinals whose suffix was capitalized by mistake.

    ```python
    >>> ord_replace("3Rd St. NW")
    "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # Whole words made of digits, then an upper-case S/N/R/T, then t/d/h in
    # either case. Each match is passed to lower_match for replacement.
    miscased_ordinal = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(miscased_ordinal, lower_match, value)

Fix string containing improperly capitalized ordinal.

>>> ord_replace("3Rd St. NW")
"3rd St. NW"

Arguments:

value: String to fix.

Returns:

str: Fixed string.

def remove_br_unicode(old: str) -> str: View Source

def remove_br_unicode(old: str) -> str:
    """Clean the input string before it is sent to the parser.

    `<br/>` and `<br />` tags are replaced with commas, and every character
    outside the ASCII range is removed.

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    # Turn HTML line breaks into commas.
    without_breaks = regex.sub(r"<br ?/>", ",", old)
    # remove unicode
    ascii_only = regex.sub(r"[^\x00-\x7F\n\r\t]", "", without_breaks)
    return ascii_only

Clean the input string before sending it to the parser by replacing HTML line-break tags (`<br/>`, `<br />`) with commas and removing non-ASCII characters.

Arguments:

old (str): String to clean.

Returns:

str: Cleaned string.