atlus

atlus is a Python package to convert raw address and phone number strings into the OSM format. It's designed to be used with US and Canadian phone numbers and addresses.

>>> import atlus
>>> atlus.abbrs("St. Francis")
"Saint Francis"
>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville", "addr:state": "CA", "addr:postcode": "98765"}
>>> atlus.get_phone("(202) 900-9019")
"+1 202-900-9019"
 1"""`atlus` is a Python package to convert raw address and phone number strings into the OSM format.
 2It's designed to be used with US and Canadian phone numbers and addresses.
 3
 4```python
 5>>> import atlus
 6>>> atlus.abbrs("St. Francis")
 7"Saint Francis"
 8>>> atlus.get_address("789 Oak Dr, Smallville California, 98765")[0]
 9{"addr:housenumber": "789", "addr:street": "Oak Drive", "addr:city": "Smallville", "addr:state": "CA", "addr:postcode": "98765"}
10>>> atlus.get_phone("(202) 900-9019")
11"+1 202-900-9019"
12```
13
14"""
15
16# SPDX-FileCopyrightText: 2024-present Will <wahubsch@gmail.com>
17#
18# SPDX-License-Identifier: MIT
19
20from .atlus import (
21    get_address,
22    get_phone,
23    abbrs,
24    get_title,
25    mc_replace,
26    us_replace,
27    ord_replace,
28    remove_br_unicode,
29)
30from . import atlus
31from . import resources
32
# Public API of the package: the names exported by `from atlus import *`.
33__all__ = [
34    "get_address",
35    "get_phone",
36    "abbrs",
37    "get_title",
38    "mc_replace",
39    "us_replace",
40    "ord_replace",
41    "remove_br_unicode",
42    "atlus",
43    "resources",
44]
def get_address(address_string: str) -> Tuple[Dict[str, str], List[Optional[str]]]:
def get_address(
    address_string: str,
) -> Tuple[Dict[str, str], List[Union[str, None]]]:
    """Turn a raw address string into OSM address tags.

    ```python
    >>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
    {"addr:housenumber": "345", "addr:street": "Maple Road",
    "addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
    >>> get_address("777 Strawberry St.")[0]
    {"addr:housenumber": "777", "addr:street": "Strawberry Street"}
    >>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
    >>> address[0]
    {"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
    >>> address[1]
    ["addr:unit"]
    ```

    Args:
        address_string (str): The address string to process.

    Returns:
        Tuple[Dict[str, str], List[Union[str, None]]]:
        The validated address tags and the fields that were removed.
    """
    dropped = []
    try:
        tagged = usaddress.tag(clean_address(address_string), tag_mapping=osm_mapping)
        tags = tagged[0]
    except usaddress.RepeatedLabelError as repeat_err:
        # The parser saw a label more than once: strip stray punctuation from
        # each parsed piece and join the pieces manually instead.
        stripped_pairs = [
            (piece[0].strip(" .,#"), piece[1]) for piece in repeat_err.parsed_string
        ]
        merged = _combine_consecutive_tuples(collapse_list(stripped_pairs))
        tags, dropped = _manual_join(merged)

    for unwanted in toss_tags:
        tags.pop(unwanted, None)

    if "addr:housenumber" in tags:
        # merge in whatever split_unit derives from the house number
        tags = {**tags, **split_unit(tags["addr:housenumber"])}

    if "addr:street" in tags:
        expanded_street = abbrs(tags["addr:street"])
        tags["addr:street"] = street_comp.sub("Street", expanded_street).strip(".")

    if "addr:city" in tags:
        titled_city = get_title(tags["addr:city"], single_word=True)
        tags["addr:city"] = abbrs(titled_city)

    if "addr:state" in tags:
        state_key = tags["addr:state"].replace(".", "").upper()
        if state_key in state_expand:
            tags["addr:state"] = state_expand[state_key]
        elif len(state_key) == 2 and state_key in list(state_expand.values()):
            # already a known two-letter code; just normalise the case
            tags["addr:state"] = state_key

    if "addr:unit" in tags:
        unit = tags["addr:unit"].removeprefix("Space")
        tags["addr:unit"] = unit.strip(" #.")

    if "addr:postcode" in tags:
        # remove extraneous postcode digits
        trimmed = post_comp.sub(r"\1", tags["addr:postcode"])
        tags["addr:postcode"] = trimmed.replace(" ", "-")

    try:
        validated = Address.model_validate(dict(tags))
    except ValidationError as invalid:
        # Drop every field the model rejected, report it, and validate again.
        rejected = [error.get("loc", [])[0] for error in invalid.errors()]
        surviving = {key: val for key, val in dict(tags).items() if key not in rejected}
        dropped.extend(rejected)
        validated = Address.model_validate(surviving)

    return validated.model_dump(exclude_none=True, by_alias=True), dropped

Process address strings.

>>> get_address("345 MAPLE RD, COUNTRYSIDE, PA 24680-0198")[0]
{"addr:housenumber": "345", "addr:street": "Maple Road",
"addr:city": "Countryside", "addr:state": "PA", "addr:postcode": "24680-0198"}
>>> get_address("777 Strawberry St.")[0]
{"addr:housenumber": "777", "addr:street": "Strawberry Street"}
>>> address = get_address("222 NW Pineapple Ave Suite A Unit B")
>>> address[0]
{"addr:housenumber": "222", "addr:street": "Northwest Pineapple Avenue"}
>>> address[1]
["addr:unit"]
Arguments:
  • address_string (str): The address string to process.
Returns:

Tuple[Dict[str, str], List[Union[str, None]]]: The processed address tags and the names of the fields that were removed.

def get_phone(phone: str) -> str:
def get_phone(phone: str) -> str:
    """Format phone numbers to the US and Canadian standard format of `+1 XXX-XXX-XXXX`.

    An optional `+1`/`1` country code and parentheses around the area code
    are accepted. Digit groups may be separated by spaces, periods or hyphens.

    ```python
    >>> get_phone("2029009019")
    "+1 202-900-9019"
    >>> get_phone("(202) 900-9019")
    "+1 202-900-9019"
    >>> get_phone("202-900-901")
    ValueError: Invalid phone number: 202-900-901
    ```

    Args:
        phone (str): The phone number to format.

    Returns:
        str: The formatted phone number.

    Raises:
        ValueError: If the phone number is invalid.
    """
    # The separator class is written `[ .-]` with the hyphen last so it is a
    # literal. The earlier `[ -.]` spelling was a range from space to period
    # and therefore also accepted characters such as `#`, `$`, `,` and `*`.
    phone_valid = regex.search(
        r"^\(?(?:\+? ?1?[ .-]*)?(?:\(?(\d{3})\)?[ .-]*)(\d{3})[ .-]*(\d{4})$",
        phone,
    )
    if phone_valid:
        area, exchange, line = phone_valid.groups()
        return f"+1 {area}-{exchange}-{line}"
    raise ValueError(f"Invalid phone number: {phone}")

Format phone numbers to the US and Canadian standard format of +1 XXX-XXX-XXXX.

>>> get_phone("2029009019")
"+1 202-900-9019"
>>> get_phone("(202) 900-9019")
"+1 202-900-9019"
>>> get_phone("202-900-901")
ValueError: Invalid phone number: 202-900-901
Arguments:
  • phone (str): The phone number to format.
Returns:

str: The formatted phone number.

Raises:
  • ValueError: If the phone number is invalid.
def abbrs(value: str) -> str:
def abbrs(value: str) -> str:
    """Run the common abbreviation-expansion steps over a string.

    ```python
    >>> abbrs("St. Francis")
    "Saint Francis"
    >>> abbrs("E St.")
    "E Street"
    >>> abbrs("E Sewell St")
    "East Sewell Street"
    ```

    Args:
        value (str): String to expand.

    Returns:
        str: Expanded string.
    """
    # The steps below run in a fixed order; each one works on the output of
    # the previous one.
    text = get_title(value)
    text = mc_replace(text)
    text = us_replace(text)
    text = ord_replace(text)

    # change likely 'St' to 'Saint'
    text = saint_comp.sub("Saint", text)

    # expand common street and word abbreviations
    text = abbr_join_comp.sub(name_street_expand, text)

    # expand directionals
    text = dir_fill_comp.sub(direct_expand, text)

    # normalize 'US'
    text = us_replace(text)

    # uppercase shortened street descriptors
    text = regex.sub(r"\b(C[rh]|S[rh]|[FR]m|Us)\b", cap_match, text)

    # remove unremoved abbr periods
    text = regex.sub(r"([a-zA-Z]{2,})\.", r"\1", text)

    # expand 'SR' if no other street types
    text = sr_comp.sub("State Route", text)

    return text.strip(" .")

Bundle most common abbreviation expansion functions.

>>> abbrs("St. Francis")
"Saint Francis"
>>> abbrs("E St.")
"E Street"
>>> abbrs("E Sewell St")
"East Sewell Street"
Arguments:
  • value (str): String to expand.
Returns:

str: Expanded string.

def get_title(value: str, single_word: bool = False) -> str:
def get_title(value: str, single_word: bool = False) -> str:
    """Convert an ALL-CAPS string to title case.

    ```python
    >>> get_title("PALM BEACH")
    "Palm Beach"
    >>> get_title("BOSTON")
    "BOSTON"
    >>> get_title("BOSTON", single_word=True)
    "Boston"
    ```

    Args:
        value: String to fix.
        single_word: Whether the string should be fixed even if it is a single word.

    Returns:
        str: Fixed string.
    """
    # Anything that is not fully upper-case is returned untouched.
    if not value.isupper():
        return value

    # A string with no space is only converted when the caller asks for it.
    if " " in value or single_word:
        return mc_replace(value.title())
    return value

Fix ALL-CAPS string.

>>> get_title("PALM BEACH")
"Palm Beach"
>>> get_title("BOSTON")
"BOSTON"
>>> get_title("BOSTON", single_word=True)
"Boston"
Arguments:
  • value: String to fix.
  • single_word: Whether the string should be fixed even if it is a single word.
Returns:

str: Fixed string.

def mc_replace(value: str) -> str:
 99def mc_replace(value: str) -> str:
100    """Fix string containing improperly formatted Mc- prefix.
101
102    ```python
103    >>> mc_replace("Fort Mchenry")
104    "Fort McHenry"
105    ```
106
107    Args:
108        value: String to fix.
109
110    Returns:
111        str: Fixed string.
112    """
113    words = []
114    for word in value.split():
115        mc_match = word.partition("Mc")
116        words.append(mc_match[0] + mc_match[1] + mc_match[2].capitalize())
117    return " ".join(words)

Fix string containing improperly formatted Mc- prefix.

>>> mc_replace("Fort Mchenry")
"Fort McHenry"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def us_replace(value: str) -> str:
def us_replace(value: str) -> str:
    """Normalise the dotted or spaced spellings of "US" in a string.

    ```python
    >>> us_replace("U.S. Route 15")
    "US Route 15"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # Applied in this order, each on the result of the previous substitution.
    substitutions = (
        ("U.S.", "US"),
        ("U. S.", "US"),
        ("U S ", "US "),
    )
    fixed = value
    for variant, canonical in substitutions:
        fixed = fixed.replace(variant, canonical)
    return fixed

Fix string containing improperly formatted US.

>>> us_replace("U.S. Route 15")
"US Route 15"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def ord_replace(value: str) -> str:
def ord_replace(value: str) -> str:
    """Fix ordinals whose suffix was capitalized by mistake.

    ```python
    >>> ord_replace("3Rd St. NW")
    "3rd St. NW"
    ```

    Args:
        value: String to fix.

    Returns:
        str: Fixed string.
    """
    # Digits followed by a two-letter suffix whose first letter is upper-case.
    capitalized_ordinal = r"(\b\d+[SNRT][tTdDhH]\b)"
    return regex.sub(capitalized_ordinal, lower_match, value)

Fix string containing improperly capitalized ordinal.

>>> ord_replace("3Rd St. NW")
"3rd St. NW"
Arguments:
  • value: String to fix.
Returns:

str: Fixed string.

def remove_br_unicode(old: str) -> str:
def remove_br_unicode(old: str) -> str:
    """Clean the input string before it is sent to the parser.

    `<br/>` and `<br />` tags are replaced with commas, then every non-ASCII
    character is dropped.

    Args:
        old (str): String to clean.

    Returns:
        str: Cleaned string.
    """
    without_breaks = regex.sub(r"<br ?/>", ",", old)
    # remove unicode
    ascii_only = regex.sub(r"[^\x00-\x7F\n\r\t]", "", without_breaks)
    return ascii_only

Clean the input string before sending it to the parser by replacing HTML <br/> line breaks with commas and removing non-ASCII characters.

Arguments:
  • old (str): String to clean.
Returns:

str: Cleaned string.