You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

118 lines
3.1 KiB

import re
import csv
import urllib.request
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import xlrd
import time
dots = []
def read_excel_file():
loc = "dots.xls"
wb = xlrd.open_workbook(loc)
sheet = wb.sheet_by_index(0)
sheet.cell_value(0, 0)
for i in range(1, 5):
dot = str(sheet.cell_value(i, 0)).replace(".0", "")
dots.append(dot)
def crawl_data(url):
req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
html = urlopen(req).read()
bs = BeautifulSoup(html, "html.parser")
bold_texts = bs.find_all("b")
for b in bold_texts:
try:
date = (
re.search(
"The information below reflects the content of the FMCSA management information systems as of(.*).",
b.get_text(strip=True, separator=" "),
)
.group(1)
.strip()
)
if len(date) > 11:
date = date.split(".", 1)[0]
print(date)
except AttributeError:
pass
information = bs.find("center").get_text(strip=True, separator=" ")
operating = re.search("Operating Status:(.*)Out", information).group(1).strip()
legal_name = re.search("Legal Name:(.*)DBA", information).group(1).strip()
physical_address = (
re.search("Physical Address:(.*)Phone", information).group(1).strip()
)
mailing_address = (
re.search("Mailing Address:(.*)USDOT", information).group(1).strip()
)
usdot_address = (
re.search("USDOT Number:(.*)State Carrier ID Number", information)
.group(1)
.strip()
)
power_units = re.search("Power Units:(.*)Drivers", information).group(1).strip()
drivers = re.search("Drivers:(.*)MCS-150 Form Date", information).group(1).strip()
write_csv(
date,
operating,
legal_name,
physical_address,
mailing_address,
usdot_address,
power_units,
drivers,
)
def write_csv(
date,
operating,
legal_name,
physical_address,
mailing_address,
usdot_address,
power_units,
drivers,
):
with open(
usdot_address + ".csv", mode="w", newline="", encoding="utf-8"
) as csv_file:
fieldnames = [
"Date",
"Operating Status",
"Legal_Name",
"Physical Address",
"Mailing Address",
"Power Units",
"Drivers",
]
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
writer.writerow(
{
"Date": date,
"Operating Status": operating,
"Legal_Name": legal_name,
"Physical Address": physical_address,
"Mailing Address": mailing_address,
"Power Units": power_units,
"Drivers": drivers,
}
)
read_excel_file()
print(dots)
for dot in dots:
crawl_data(
"https://safer.fmcsa.dot.gov/query.asp?searchtype=ANY&query_type=queryCarrierSnapshot&query_param=USDOT&query_string="
+ dot
)
time.sleep(5)