mirror of https://github.com/Black-Gold/Learn
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
118 lines
3.1 KiB
118 lines
3.1 KiB
import re
|
|
import csv
|
|
import urllib.request
|
|
from urllib.request import urlopen, Request
|
|
from bs4 import BeautifulSoup
|
|
import xlrd
|
|
import time
|
|
|
|
# Module-level accumulator: read_excel_file() appends the values it reads from
# the spreadsheet here, and the script loop at the bottom of the file sends
# each entry to the carrier-snapshot query as the USDOT query string.
dots = []
|
|
|
|
|
|
def read_excel_file(loc="dots.xls", max_rows=4):
    """Append the DOT numbers found in the first column of a workbook to ``dots``.

    Row 0 of the first sheet is skipped; data is read from row 1 onwards.

    Args:
        loc: Path of the ``.xls`` workbook to read. Defaults to ``"dots.xls"``.
        max_rows: Maximum number of data rows to read (default 4, i.e. rows
            1-4). Pass ``None`` to read every row in the sheet.

    The result is accumulated in the module-level ``dots`` list; nothing is
    returned.
    """
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_index(0)

    # Never index past the rows that actually exist: a short sheet is read as
    # far as it goes instead of raising IndexError.
    if max_rows is None:
        end_row = sheet.nrows
    else:
        end_row = min(sheet.nrows, max_rows + 1)

    for row in range(1, end_row):
        dot = str(sheet.cell_value(row, 0)).strip()
        # Numeric cells come back as floats ("1234567.0"); drop only that
        # trailing ".0" rather than every ".0" occurrence inside the value.
        if dot.endswith(".0"):
            dot = dot[:-2]
        # A blank cell would otherwise become an empty query string.
        if not dot:
            continue
        dots.append(dot)
|
|
|
|
|
|
def crawl_data(url):
    """Download one carrier snapshot page and save its key fields as a CSV file.

    The page is fetched with a browser-like ``User-Agent`` header, parsed with
    BeautifulSoup, and the fields below are pulled out of the text of the first
    ``<center>`` element with regular expressions, then handed to
    ``write_csv``.

    Args:
        url: Full URL of the snapshot page to fetch.

    If the page has no ``<center>`` element or no USDOT number can be
    extracted, a message is printed and nothing is written, so one bad lookup
    does not abort a whole batch. Any other field that cannot be found is
    written as an empty string.
    """
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # Use the response as a context manager so the connection is always closed.
    with urlopen(req) as response:
        html = response.read()
    bs = BeautifulSoup(html, "html.parser")

    # Default to an empty date so a page without the "as of" banner does not
    # leave the variable unbound. If several <b> elements match, the last wins.
    date = ""
    for b in bs.find_all("b"):
        match = re.search(
            "The information below reflects the content of the FMCSA management information systems as of(.*).",
            b.get_text(strip=True, separator=" "),
        )
        if match is None:
            continue
        date = match.group(1).strip()
        # Longer captures carry trailing text after the date; keep only the
        # part before the first period.
        if len(date) > 11:
            date = date.split(".", 1)[0]
        print(date)

    center = bs.find("center")
    if center is None:
        print("No carrier snapshot found at " + url)
        return
    information = center.get_text(strip=True, separator=" ")

    # NOTE(review): these patterns are greedy, so each capture runs to the
    # LAST occurrence of its terminating label in the text - confirm that is
    # intended for the live page layout.
    operating = _search_field("Operating Status:(.*)Out", information)
    legal_name = _search_field("Legal Name:(.*)DBA", information)
    physical_address = _search_field("Physical Address:(.*)Phone", information)
    mailing_address = _search_field("Mailing Address:(.*)USDOT", information)
    usdot_address = _search_field(
        "USDOT Number:(.*)State Carrier ID Number", information
    )
    power_units = _search_field("Power Units:(.*)Drivers", information)
    drivers = _search_field("Drivers:(.*)MCS-150 Form Date", information)

    # The USDOT number names the output file, so without it there is nowhere
    # sensible to write the record.
    if not usdot_address:
        print("No USDOT number found at " + url)
        return

    write_csv(
        date,
        operating,
        legal_name,
        physical_address,
        mailing_address,
        usdot_address,
        power_units,
        drivers,
    )


def _search_field(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or "" if no match."""
    match = re.search(pattern, text)
    if match is None:
        return ""
    return match.group(1).strip()
|
|
|
|
|
|
def write_csv(
    date,
    operating,
    legal_name,
    physical_address,
    mailing_address,
    usdot_address,
    power_units,
    drivers,
):
    """Write one carrier record to ``<usdot_address>.csv``.

    The file is (re)created in the current working directory as UTF-8 and
    contains a header row followed by a single data row. ``usdot_address`` is
    used only to build the file name; it is not written as a column.
    """
    columns = (
        "Date",
        "Operating Status",
        "Legal_Name",
        "Physical Address",
        "Mailing Address",
        "Power Units",
        "Drivers",
    )
    values = (
        date,
        operating,
        legal_name,
        physical_address,
        mailing_address,
        power_units,
        drivers,
    )
    # Pair each column title with its value, preserving column order.
    record = dict(zip(columns, values))

    out_name = usdot_address + ".csv"
    with open(out_name, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(columns))
        writer.writeheader()
        writer.writerow(record)
|
|
|
|
|
|
# Script driver: load the DOT numbers from the spreadsheet, echo them, then
# fetch and save the snapshot for each one, pausing 5 seconds after every
# request.
read_excel_file()
print(dots)
for dot in dots:
    snapshot_url = (
        "https://safer.fmcsa.dot.gov/query.asp?searchtype=ANY&query_type=queryCarrierSnapshot&query_param=USDOT&query_string="
        + dot
    )
    crawl_data(snapshot_url)
    time.sleep(5)
|