# Learn/Py3Scripts/py_extract_data.py

import re
import csv
import urllib.request
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import xlrd
import time

dots = []


def read_excel_file(loc="dots.xls", max_rows=4):
    """Append the values of the first column of an Excel sheet to ``dots``.

    Row 0 is skipped (presumably a header row -- confirm against the
    workbook). Each remaining cell in column 0 is converted to a string and
    appended to the module-level ``dots`` list.

    Args:
        loc: Path of the workbook to open. Defaults to ``"dots.xls"``.
        max_rows: Maximum number of rows to read after row 0. Defaults to 4,
            which matches the previously hard-coded ``range(1, 5)``. Pass
            ``None`` to read every row in the sheet.
    """
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_index(0)

    # Clamp to the rows that actually exist so a short sheet is read as far
    # as it goes instead of raising IndexError.
    if max_rows is None:
        stop_row = sheet.nrows
    else:
        stop_row = min(sheet.nrows, 1 + max_rows)

    for i in range(1, stop_row):
        value = sheet.cell_value(i, 0)
        # Numeric cells come back as floats (e.g. 123456.0). Convert whole
        # numbers through int() rather than deleting every ".0" substring,
        # which would also mangle a value such as 12.05 into "125".
        if isinstance(value, float) and value.is_integer():
            dot = str(int(value))
        else:
            dot = str(value).strip()
        # A blank cell would otherwise produce a query with no number in it.
        if dot:
            dots.append(dot)


def _field_between(text, label, terminator):
    """Return the stripped text between *label* and the nearest *terminator*.

    Raises:
        ValueError: if *label* followed by *terminator* does not occur in
            *text*.
    """
    # Non-greedy so the capture stops at the first terminator after the label
    # rather than the last one on the line; both markers are escaped so they
    # are matched literally.
    match = re.search(re.escape(label) + r"(.*?)" + re.escape(terminator), text)
    if match is None:
        raise ValueError(
            "field {!r} (terminated by {!r}) not found in page".format(
                label, terminator
            )
        )
    return match.group(1).strip()


def crawl_data(url):
    """Download the page at *url*, extract the carrier fields and save them.

    The "as of" date is taken from the ``<b>`` elements of the page; the
    remaining fields are cut out of the text of the first ``<center>``
    element. The collected values are handed to ``write_csv``.

    Args:
        url: Address of the page to fetch.

    Raises:
        ValueError: if the page has no ``<center>`` element or one of the
            expected field labels is missing.
    """
    # NOTE(review): a browser-like User-Agent is sent explicitly; presumably
    # the site rejects the default urllib agent -- confirm.
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # Close the HTTP response deterministically instead of leaking it.
    with urlopen(req) as response:
        html = response.read()
    bs = BeautifulSoup(html, "html.parser")

    # Start from an empty date so a page without the "as of" sentence no
    # longer fails with an unbound variable when the row is written.
    date = ""
    for b in bs.find_all("b"):
        match = re.search(
            r"The information below reflects the content of the FMCSA management information systems as of(.*)",
            b.get_text(strip=True, separator="  "),
        )
        if match is None:
            continue
        # Keep only what precedes the first period, i.e. the end of the
        # sentence, so trailing text after the date is dropped.
        date = match.group(1).split(".", 1)[0].strip()
        print(date)

    center = bs.find("center")
    if center is None:
        raise ValueError("no <center> element found at {}".format(url))
    information = center.get_text(strip=True, separator="  ")

    operating = _field_between(information, "Operating Status:", "Out")
    legal_name = _field_between(information, "Legal Name:", "DBA")
    physical_address = _field_between(information, "Physical Address:", "Phone")
    mailing_address = _field_between(information, "Mailing Address:", "USDOT")
    usdot_address = _field_between(
        information, "USDOT Number:", "State Carrier ID Number"
    )
    power_units = _field_between(information, "Power Units:", "Drivers")
    drivers = _field_between(information, "Drivers:", "MCS-150 Form Date")

    write_csv(
        date,
        operating,
        legal_name,
        physical_address,
        mailing_address,
        usdot_address,
        power_units,
        drivers,
    )


def write_csv(
    date,
    operating,
    legal_name,
    physical_address,
    mailing_address,
    usdot_address,
    power_units,
    drivers,
):
    """Write a single-record CSV file named ``<usdot_address>.csv``.

    The file is created (or overwritten) in the current working directory as
    UTF-8 and contains a header line followed by one data line.
    ``usdot_address`` is used only to build the file name; it is not written
    as a column.
    """
    # Insertion order of this mapping doubles as the column order.
    record = {
        "Date": date,
        "Operating Status": operating,
        "Legal_Name": legal_name,
        "Physical Address": physical_address,
        "Mailing Address": mailing_address,
        "Power Units": power_units,
        "Drivers": drivers,
    }
    target = usdot_address + ".csv"

    with open(target, mode="w", newline="", encoding="utf-8") as out:
        sheet = csv.DictWriter(out, fieldnames=list(record))
        sheet.writeheader()
        sheet.writerow(record)


# Driver: load the numbers from the spreadsheet, show them, then fetch and
# save the page for each one, pausing five seconds after every request.
read_excel_file()
print(dots)

query_prefix = "https://safer.fmcsa.dot.gov/query.asp?searchtype=ANY&query_type=queryCarrierSnapshot&query_param=USDOT&query_string="
for dot in dots:
    target_url = query_prefix + dot
    crawl_data(target_url)
    time.sleep(5)
5 years ago			`import re`
			`import csv`
			`import urllib.request`
			`from urllib.request import urlopen, Request`
			`from bs4 import BeautifulSoup`
			`import xlrd`
			`import time`

			`dots = []`


			`def read_excel_file():`
			`loc = "dots.xls"`
			`wb = xlrd.open_workbook(loc)`
			`sheet = wb.sheet_by_index(0)`
			`sheet.cell_value(0, 0)`
			`for i in range(1, 5):`
			`dot = str(sheet.cell_value(i, 0)).replace(".0", "")`
			`dots.append(dot)`


			`def crawl_data(url):`
			`req = Request(url, headers={"User-Agent": "Mozilla/5.0"})`
			`html = urlopen(req).read()`
			`bs = BeautifulSoup(html, "html.parser")`
			`bold_texts = bs.find_all("b")`
			`for b in bold_texts:`
			`try:`
			`date = (`
			`re.search(`
			`"The information below reflects the content of the FMCSA management information systems as of(.*).",`
			`b.get_text(strip=True, separator=" "),`
			`)`
			`.group(1)`
			`.strip()`
			`)`
			`if len(date) > 11:`
			`date = date.split(".", 1)[0]`
			`print(date)`
			`except AttributeError:`
			`pass`

			`information = bs.find("center").get_text(strip=True, separator=" ")`

			`operating = re.search("Operating Status:(.*)Out", information).group(1).strip()`
			`legal_name = re.search("Legal Name:(.*)DBA", information).group(1).strip()`
			`physical_address = (`
			`re.search("Physical Address:(.*)Phone", information).group(1).strip()`
			`)`
			`mailing_address = (`
			`re.search("Mailing Address:(.*)USDOT", information).group(1).strip()`
			`)`
			`usdot_address = (`
			`re.search("USDOT Number:(.*)State Carrier ID Number", information)`
			`.group(1)`
			`.strip()`
			`)`
			`power_units = re.search("Power Units:(.*)Drivers", information).group(1).strip()`
			`drivers = re.search("Drivers:(.*)MCS-150 Form Date", information).group(1).strip()`

			`write_csv(`
			`date,`
			`operating,`
			`legal_name,`
			`physical_address,`
			`mailing_address,`
			`usdot_address,`
			`power_units,`
			`drivers,`
			`)`


			`def write_csv(`
			`date,`
			`operating,`
			`legal_name,`
			`physical_address,`
			`mailing_address,`
			`usdot_address,`
			`power_units,`
			`drivers,`
			`):`
			`with open(`
			`usdot_address + ".csv", mode="w", newline="", encoding="utf-8"`
			`) as csv_file:`
			`fieldnames = [`
			`"Date",`
			`"Operating Status",`
			`"Legal_Name",`
			`"Physical Address",`
			`"Mailing Address",`
			`"Power Units",`
			`"Drivers",`
			`]`
			`writer = csv.DictWriter(csv_file, fieldnames=fieldnames)`
			`writer.writeheader()`
			`writer.writerow(`
			`{`
			`"Date": date,`
			`"Operating Status": operating,`
			`"Legal_Name": legal_name,`
			`"Physical Address": physical_address,`
			`"Mailing Address": mailing_address,`
			`"Power Units": power_units,`
			`"Drivers": drivers,`
			`}`
			`)`


			`read_excel_file()`
			`print(dots)`
			`for dot in dots:`
			`crawl_data(`
			`"https://safer.fmcsa.dot.gov/query.asp?searchtype=ANY&query_type=queryCarrierSnapshot&query_param=USDOT&query_string="`
			`+ dot`
			`)`
			`time.sleep(5)`