# Learn/Py3Scripts/py_extract_data.py

import re
import csv
import urllib.request
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import xlrd
import time

# DOT numbers (as strings) collected from the spreadsheet; appended to by
# read_excel_file() and iterated by the crawl loop at the bottom of the file.
dots = []


def read_excel_file(loc="dots.xls", max_rows=4):
    """Load DOT numbers from the first column of a workbook into ``dots``.

    Row 0 of the first sheet is treated as a header and skipped. Each
    following cell in column 0 is converted to a string and appended to the
    module-level ``dots`` list. Blank cells are skipped.

    Args:
        loc: Path of the ``.xls`` workbook to open.
        max_rows: Maximum number of data rows (after the header) to read.
            Pass ``None`` to read every row in the sheet.
    """
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_index(0)

    # Clamp to the rows that actually exist so a short sheet does not raise
    # IndexError.
    if max_rows is None:
        stop_row = sheet.nrows
    else:
        stop_row = min(sheet.nrows, max_rows + 1)

    for i in range(1, stop_row):
        dot = str(sheet.cell_value(i, 0)).strip()
        # Numeric cells stringify like "1234.0"; drop only that trailing
        # suffix rather than every ".0" occurrence inside the value.
        if dot.endswith(".0"):
            dot = dot[:-2]
        if dot:
            dots.append(dot)


# Sentence in the page's bold text that is followed by the snapshot date.
# NOTE: the trailing "." is an unescaped wildcard, so the greedy group stops
# one character short of the end of the line (normally the final period).
_SNAPSHOT_DATE_PATTERN = (
    r"The information below reflects the content of the FMCSA management information systems as of(.*)."
)


def _search_field(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or ""."""
    match = re.search(pattern, text)
    if match is None:
        return ""
    return match.group(1).strip()


def crawl_data(url):
    """Download a carrier snapshot page and save its key fields as a CSV.

    The page at *url* is fetched with a browser-like User-Agent and parsed
    with BeautifulSoup. The snapshot date is taken from the bold text, and
    the carrier fields are cut out of the text of the first ``<center>``
    element using the labels that surround them. The result is handed to
    ``write_csv``.

    If the page has no ``<center>`` element or no USDOT number can be found,
    a message is printed and nothing is written. Any other field that cannot
    be found is written as an empty string.

    Args:
        url: Full URL of the snapshot page to fetch.
    """
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # Use the response as a context manager so the connection is closed.
    with urlopen(req) as response:
        html = response.read()
    bs = BeautifulSoup(html, "html.parser")

    # Default keeps `date` defined when no bold element carries the sentence.
    date = ""
    for b in bs.find_all("b"):
        match = re.search(
            _SNAPSHOT_DATE_PATTERN, b.get_text(strip=True, separator="  ")
        )
        if match is None:
            continue
        date = match.group(1).strip()
        # Anything longer than a short date has trailing text; keep only the
        # part before the first period.
        if len(date) > 11:
            date = date.split(".", 1)[0]
        print(date)

    center = bs.find("center")
    if center is None:
        print("No <center> element found, skipping:", url)
        return
    information = center.get_text(strip=True, separator="  ")

    usdot_number = _search_field(
        "USDOT Number:(.*)State Carrier ID Number", information
    )
    if not usdot_number:
        # The USDOT number names the output file, so there is nothing
        # sensible to write without it.
        print("No USDOT number found, skipping:", url)
        return

    operating = _search_field("Operating Status:(.*)Out", information)
    legal_name = _search_field("Legal Name:(.*)DBA", information)
    physical_address = _search_field("Physical Address:(.*)Phone", information)
    mailing_address = _search_field("Mailing Address:(.*)USDOT", information)
    power_units = _search_field("Power Units:(.*)Drivers", information)
    drivers = _search_field("Drivers:(.*)MCS-150 Form Date", information)

    write_csv(
        date,
        operating,
        legal_name,
        physical_address,
        mailing_address,
        usdot_number,
        power_units,
        drivers,
    )


def write_csv(
    date,
    operating,
    legal_name,
    physical_address,
    mailing_address,
    usdot_address,
    power_units,
    drivers,
):
    """Write one carrier's details to ``<usdot_address>.csv``.

    The file is created (or overwritten) as UTF-8 and contains a header row
    followed by a single data row. ``usdot_address`` is used only to build
    the file name; it is not written as a column.
    """
    header = (
        "Date",
        "Operating Status",
        "Legal_Name",
        "Physical Address",
        "Mailing Address",
        "Power Units",
        "Drivers",
    )
    # Values listed in the same order as the header columns above.
    values = (
        date,
        operating,
        legal_name,
        physical_address,
        mailing_address,
        power_units,
        drivers,
    )
    out_path = usdot_address + ".csv"
    with open(out_path, mode="w", newline="", encoding="utf-8") as out_file:
        rows = csv.writer(out_file)
        rows.writerow(header)
        rows.writerow(values)


# Script body: load the DOT numbers, show them, then fetch each carrier's
# snapshot page in turn, pausing five seconds after every request.
read_excel_file()
print(dots)
for dot in dots:
    snapshot_url = (
        "https://safer.fmcsa.dot.gov/query.asp?searchtype=ANY&query_type=queryCarrierSnapshot&query_param=USDOT&query_string="
        + dot
    )
    crawl_data(snapshot_url)
    time.sleep(5)