You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

118 lines
3.1 KiB

5 years ago
import re
import csv
import urllib.request
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import xlrd
import time
# USDOT numbers (as strings) collected by read_excel_file() and consumed by
# the crawl loop at the bottom of the script.
dots = []


def read_excel_file():
    """Append USDOT numbers from the first sheet of ``dots.xls`` to ``dots``.

    Column 0 of rows 1-4 is read; row 0 is skipped (presumably a header row).
    Reading stops early when the sheet has fewer rows, instead of raising
    ``IndexError``. Each value is converted to a string, and a trailing
    ``".0"`` (the float suffix of a whole number) is removed.

    Returns:
        None. Results are appended to the module-level ``dots`` list.
    """
    loc = "dots.xls"
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_index(0)
    # Keep the original four-row limit, but never index past the last row.
    stop_row = min(5, sheet.nrows)
    for i in range(1, stop_row):
        dot = str(sheet.cell_value(i, 0))
        # Strip only a trailing ".0"; a blanket replace would also mangle
        # values such as "100.05".
        if dot.endswith(".0"):
            dot = dot[:-2]
        dots.append(dot)
def _extract_field(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or ""."""
    found = re.search(pattern, text)
    if found is None:
        return ""
    return found.group(1).strip()


def crawl_data(url):
    """Download one carrier snapshot page and save its key fields as CSV.

    The page at *url* is fetched with a browser-like User-Agent and parsed
    with BeautifulSoup. The "as of" date is taken from the ``<b>`` elements,
    and the remaining fields are cut out of the text of the first
    ``<center>`` element using label-to-label regular expressions. The result
    is handed to ``write_csv``, which names the file after the USDOT number.

    If the page has no ``<center>`` element, or no USDOT number can be
    extracted, a message is printed and nothing is written. Any other field
    that cannot be found is written as an empty string.

    Args:
        url: Full URL of the snapshot page to fetch.

    Returns:
        None.
    """
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # Close the HTTP response as soon as the body has been read.
    with urlopen(req) as response:
        html = response.read()
    bs = BeautifulSoup(html, "html.parser")

    # Default used when no <b> element carries the "as of" sentence, so the
    # name is always bound before write_csv() is called.
    date = ""
    for b in bs.find_all("b"):
        # NOTE(review): the final "." in the pattern is an unescaped wildcard,
        # so it consumes whatever character ends the text (normally the
        # sentence's period). The pattern is kept as-is to preserve behaviour.
        match = re.search(
            r"The information below reflects the content of the FMCSA management information systems as of(.*).",
            b.get_text(strip=True, separator=" "),
        )
        if match is None:
            continue
        date = match.group(1).strip()
        # Longer captures carry trailing text after the date; keep only the
        # part before the first period.
        if len(date) > 11:
            date = date.split(".", 1)[0]
        print(date)

    center = bs.find("center")
    if center is None:
        print("No <center> element found, skipping: {}".format(url))
        return
    information = center.get_text(strip=True, separator=" ")

    # Each value sits between its own label and the label of the next field.
    operating = _extract_field(r"Operating Status:(.*)Out", information)
    legal_name = _extract_field(r"Legal Name:(.*)DBA", information)
    physical_address = _extract_field(r"Physical Address:(.*)Phone", information)
    mailing_address = _extract_field(r"Mailing Address:(.*)USDOT", information)
    usdot_address = _extract_field(
        r"USDOT Number:(.*)State Carrier ID Number", information
    )
    power_units = _extract_field(r"Power Units:(.*)Drivers", information)
    drivers = _extract_field(r"Drivers:(.*)MCS-150 Form Date", information)

    # The USDOT number becomes the output file name, so it is mandatory.
    if not usdot_address:
        print("No USDOT number found, skipping: {}".format(url))
        return

    write_csv(
        date,
        operating,
        legal_name,
        physical_address,
        mailing_address,
        usdot_address,
        power_units,
        drivers,
    )
def write_csv(
    date,
    operating,
    legal_name,
    physical_address,
    mailing_address,
    usdot_address,
    power_units,
    drivers,
):
    """Write a single carrier record to ``<usdot_address>.csv``.

    The file is opened in write mode, so an existing file of the same name is
    replaced. It contains one header row followed by one data row. The USDOT
    number is used only for the file name and is not written as a column.

    Args:
        date: Snapshot date text.
        operating: Operating status text.
        legal_name: Carrier legal name.
        physical_address: Physical address text.
        mailing_address: Mailing address text.
        usdot_address: USDOT number; ".csv" is appended to form the path.
        power_units: Power unit count text.
        drivers: Driver count text.

    Returns:
        None.
    """
    # Column title paired with its value, in output order.
    columns = (
        ("Date", date),
        ("Operating Status", operating),
        ("Legal_Name", legal_name),
        ("Physical Address", physical_address),
        ("Mailing Address", mailing_address),
        ("Power Units", power_units),
        ("Drivers", drivers),
    )
    header = [title for title, _ in columns]
    record = dict(columns)

    target = usdot_address + ".csv"
    with open(target, mode="w", newline="", encoding="utf-8") as out:
        sheet = csv.DictWriter(out, fieldnames=header)
        sheet.writeheader()
        sheet.writerow(record)
# Fill the module-level ``dots`` list from the spreadsheet and echo it.
read_excel_file()
print(dots)

# Crawl one snapshot page per DOT number, pausing five seconds after each
# request.
for dot in dots:
    snapshot_url = "".join(
        (
            "https://safer.fmcsa.dot.gov/query.asp?searchtype=ANY&query_type=queryCarrierSnapshot&query_param=USDOT&query_string=",
            dot,
        )
    )
    crawl_data(snapshot_url)
    time.sleep(5)