root 5 years ago
parent 6aefff0af3
commit 246e7cd5e9

@ -23,4 +23,3 @@ To save output to FILE, use 'nohup COMMAND > FILE
nohup command > myout.file 2>&1 & nohup command > myout.file 2>&1 &
``` ```

@ -38,7 +38,8 @@ def digital_to_chinese(digital):
return last_str return last_str
number = float(input("输入需要转换的数字:")) # number = float(input("输入需要转换的数字:"))
number = float(4650)
if __name__ == '__main__': if __name__ == '__main__':
digital_to_chinese(number) digital_to_chinese(number)

@ -0,0 +1,117 @@
import re
import csv
import urllib.request
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import xlrd
import time
dots = []
def read_excel_file(loc="dots.xls", first_row=1, last_row=5):
    """Append values from column 0 of a workbook's first sheet to ``dots``.

    Each cell is converted with ``str()``; a trailing ``".0"`` is removed so
    that a whole number which stringifies as ``"1234567.0"`` is stored as
    ``"1234567"``. Only a *trailing* ``".0"`` is stripped, so a value such as
    ``"1.05"`` is left intact.

    Args:
        loc: Path of the ``.xls`` workbook to open.
        first_row: Zero-based index of the first row to read (row 0 is
            skipped by default, presumably a header -- confirm with the
            spreadsheet).
        last_row: Zero-based index one past the last row to read.

    Returns:
        None. The module-level ``dots`` list is extended in place.
    """
    wb = xlrd.open_workbook(loc)
    sheet = wb.sheet_by_index(0)
    # Clamp to the rows that exist so a short sheet does not raise IndexError.
    stop = min(last_row, sheet.nrows)
    for i in range(first_row, stop):
        dot = str(sheet.cell_value(i, 0))
        if dot.endswith(".0"):
            dot = dot[:-2]
        dots.append(dot)
def crawl_data(url):
    """Download one carrier snapshot page and write its fields to a CSV file.

    The page is parsed with BeautifulSoup. The "as of" date is taken from the
    ``<b>`` elements, the remaining fields from the text of the first
    ``<center>`` element, and everything is handed to ``write_csv``.

    Args:
        url: Address of the snapshot page to fetch.

    Raises:
        AttributeError: If the page has no ``<center>`` element or one of the
            labelled fields cannot be found in its text.
    """
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # Close the HTTP response as soon as the body has been read.
    with urlopen(req) as response:
        html = response.read()
    bs = BeautifulSoup(html, "html.parser")

    # Start with an empty date so the name is always bound, even when no
    # <b> element carries the "as of" banner.
    date = ""
    for b in bs.find_all("b"):
        match = re.search(
            "The information below reflects the content of the FMCSA management information systems as of(.*).",
            b.get_text(strip=True, separator=" "),
        )
        if match is None:
            continue
        date = match.group(1).strip()
        if len(date) > 11:
            # Keep only the text before the first full stop.
            date = date.split(".", 1)[0]
        print(date)

    information = bs.find("center").get_text(strip=True, separator=" ")

    def field(pattern):
        # Return the stripped text captured by ``pattern`` in the page text.
        return re.search(pattern, information).group(1).strip()

    operating = field("Operating Status:(.*)Out")
    legal_name = field("Legal Name:(.*)DBA")
    physical_address = field("Physical Address:(.*)Phone")
    mailing_address = field("Mailing Address:(.*)USDOT")
    usdot_address = field("USDOT Number:(.*)State Carrier ID Number")
    power_units = field("Power Units:(.*)Drivers")
    drivers = field("Drivers:(.*)MCS-150 Form Date")

    write_csv(
        date,
        operating,
        legal_name,
        physical_address,
        mailing_address,
        usdot_address,
        power_units,
        drivers,
    )
def write_csv(
    date,
    operating,
    legal_name,
    physical_address,
    mailing_address,
    usdot_address,
    power_units,
    drivers,
):
    """Write a single carrier record to ``<usdot_address>.csv``.

    The file is created (or overwritten) as UTF-8 and contains one header row
    followed by one data row. ``usdot_address`` is used only to build the file
    name; it is not written as a column.
    """
    # Ordered (column header, value) pairs; the order defines the CSV layout.
    record = (
        ("Date", date),
        ("Operating Status", operating),
        ("Legal_Name", legal_name),
        ("Physical Address", physical_address),
        ("Mailing Address", mailing_address),
        ("Power Units", power_units),
        ("Drivers", drivers),
    )
    target = usdot_address + ".csv"
    with open(target, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(
            csv_file, fieldnames=[header for header, _ in record]
        )
        writer.writeheader()
        writer.writerow(dict(record))
# Script entry: fill the module-level ``dots`` list from the spreadsheet,
# echo it, then crawl the carrier snapshot page for each entry.
read_excel_file()
print(dots)
for dot in dots:
# The query string of the snapshot URL is completed with the current entry.
crawl_data(
"https://safer.fmcsa.dot.gov/query.asp?searchtype=ANY&query_type=queryCarrierSnapshot&query_param=USDOT&query_string="
+ dot
)
# NOTE(review): indentation is lost in this view; this pause presumably runs
# once per request inside the loop -- confirm against the real file.
time.sleep(5)

@ -0,0 +1,55 @@
# Python多线程例子来示例加线程锁
# 1。使用线程定义一个子类。线程类
# 2。实例化子类并触发线程
# 3。在线程的运行方法中实现锁
import threading
import datetime
exitFlag = 0
class myThread(threading.Thread):
    """Thread that prints today's date while holding the shared lock.

    Attributes are set from the constructor arguments: ``name`` is the
    thread's display name, and ``counter`` is stored both as ``threadID``
    and as ``counter``.
    """

    def __init__(self, name, counter):
        super().__init__()
        self.threadID = counter
        self.name = name
        self.counter = counter

    def run(self):
        """Print start/exit banners around a lock-protected ``print_date``."""
        print("\n开始 " + self.name)
        # The context manager releases the lock even if print_date raises,
        # so a failure in one thread cannot block the others forever.
        with threadLock:
            print_date(self.name, self.counter)
        print("退出 " + self.name)
def print_date(threadName, counter):
    """Print ``<threadName>[<counter>]: <today's date>`` on one line."""
    current_day = datetime.date.today()
    line = "{}[{}]: {}".format(threadName, counter, current_day)
    print(line)
# Lock shared by every myThread instance to serialise the date printing.
threadLock = threading.Lock()

# Build both worker threads up front and keep them in one list.
thread1 = myThread("线程", 1)
thread2 = myThread("线程", 2)
threads = [thread1, thread2]

# Launch the workers in creation order.
for thread in threads:
    thread.start()

# Block until every worker has finished before announcing the exit.
for thread in threads:
    thread.join()
print("\n退出程序!!!")

@ -39,6 +39,10 @@ collection = db.arc_AdminConf
# 以product_id升序创建索引 # 以product_id升序创建索引
# create_index = collection.create_index([('product_id', pymongo.ASCENDING)], unique=True) # create_index = collection.create_index([('product_id', pymongo.ASCENDING)], unique=True)
# 打印集合索引
for index in collection.list_indexes():
pprint(index)
# 打印集合索引信息 # 打印集合索引信息
# pprint(sorted(list(collection.index_information()))) # pprint(sorted(list(collection.index_information())))

@ -171,7 +171,7 @@ WebDriver都将延迟 driver.get() 的响应或 driver.navigate().to() 的调用
# normal默认加载策略 # normal默认加载策略
# WebDriver等待整个页面的加载设置为normal时WebDriver保持等待直到返回load事件 # WebDriver等待整个页面的加载设置为normal时WebDriver保持等待直到返回load事件
options = Options() options = Options()
options.page_load_strategy = 'normal' options.page_load_strategy = "normal"
browser = webdriver.Chrome(options=options) browser = webdriver.Chrome(options=options)
browser.get("xxx.com") browser.get("xxx.com")
browser.quit() browser.quit()
@ -179,7 +179,7 @@ browser.quit()
# eager加载策略 # eager加载策略
# WebDriver保持等待并直到完全加载并解析了html文件忽略css样式表、图片和subframes的加载 # WebDriver保持等待并直到完全加载并解析了html文件忽略css样式表、图片和subframes的加载
# 设置为eager时保持等待直到返回DOMContentLoaded事件 # 设置为eager时保持等待直到返回DOMContentLoaded事件
options.page_load_strategy = 'eager' options.page_load_strategy = "eager"
# none加载策略 # none加载策略
# WebDriver仅等待至初始页面下载完成 # WebDriver仅等待至初始页面下载完成
@ -200,7 +200,7 @@ search_box = search_src.find_element_by_name("q")
search_box.send_keys("searchconent") search_box.send_keys("searchconent")
# 从父元素的上下文查找匹配子webelement的列表 # 从父元素的上下文查找匹配子webelement的列表
element = browser.find_element_by_tag_name('div') element = browser.find_element_by_tag_name("div")
sub_elements = element.find_element_by_tag_name("p") sub_elements = element.find_element_by_tag_name("p")
for ele in sub_elements: for ele in sub_elements:
print(ele.text) print(ele.text)
@ -263,14 +263,16 @@ webdriver.ActionChains(browser).drag_and_drop(sourceEle,targetEle).perform()
# 在一个元素点击并按住,然后移动一定的偏移量 # 在一个元素点击并按住,然后移动一定的偏移量
targetEleXOffset = targetEle.location.get("x") targetEleXOffset = targetEle.location.get("x")
targetEleYOffset = targetEle.location.get("y") targetEleYOffset = targetEle.location.get("y")
webdriver.ActionChains(browser).drag_and_drop_by_offset(sourceEle, targetEleXOffset, targetEleYOffset).perform() webdriver.ActionChains(browser).drag_and_drop_by_offset(
sourceEle, targetEleXOffset, targetEleYOffset
).perform()
# 释放按下的鼠标左键如果webelement移动了将自动释放在给定webelement上按下的鼠标左键 # 释放按下的鼠标左键如果webelement移动了将自动释放在给定webelement上按下的鼠标左键
webdriver.ActionChains(browser).release().perform() webdriver.ActionChains(browser).release().perform()
# 添加cookies # 添加cookies
# 常用于将cookie添加到当前访问的上下文中. 添加Cookie仅接受一组已定义的可序列化JSON对象 # 常用于将cookie添加到当前访问的上下文中. 添加Cookie仅接受一组已定义的可序列化JSON对象
browser.get('xx.com') browser.get("xx.com")
browser.add_cookie({"name": "key", "value": "value"}) browser.add_cookie({"name": "key", "value": "value"})
# 获取cookie # 获取cookie
@ -285,9 +287,9 @@ browser.delete_all_cookies()
# Lax,将Cookie sameSite属性设置为Lax时该Cookie将与第三方网站发起的GET请求一起发送 # Lax,将Cookie sameSite属性设置为Lax时该Cookie将与第三方网站发起的GET请求一起发送
# 目前此功能已嵌入chrome80+,适用于selenium4+ # 目前此功能已嵌入chrome80+,适用于selenium4+
driver.add_cookie({"name": "foo", "value": "value", 'sameSite': 'Strict'}) driver.add_cookie({"name": "foo", "value": "value", "sameSite": "Strict"})
driver.add_cookie({"name": "foo1", "value": "value", 'sameSite': 'Lax'}) driver.add_cookie({"name": "foo1", "value": "value", "sameSite": "Lax"})
cookie1 = driver.get_cookie('foo') cookie1 = driver.get_cookie("foo")
cookie2 = driver.get_cookie('foo1') cookie2 = driver.get_cookie("foo1")
print(cookie1) print(cookie1)
print(cookie2) print(cookie2)

@ -0,0 +1,127 @@
import time
import datetime
import re
import xlwt
from xlwt import Workbook
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class Bolagsverket:
    """Selenium crawler that exports announcement details to an ``.xls`` file.

    It drives Firefox through the search form at
    ``https://poit.bolagsverket.se/poit/PublikPoitIn.do``, opens every result
    on every result page and writes four fields per announcement to a
    workbook with one sheet per result page.
    """

    # Seconds to pause after each navigation step before touching the page.
    PAUSE_SECONDS = 5

    # (column header, regular expression) for each exported field, in column
    # order. Group 1 of each expression is the value written to the sheet.
    FIELDS = (
        ("Post Address", "Postadress:(.*),"),
        ("Bildat", "Bildat:(.*)\n"),
        ("Foretagsnamn", "Företagsnamn:(.*)\n"),
        ("Email", "E-post:(.*)\n"),
    )

    def __init__(self, executable_path="E:/geckodriver"):
        """Start a Firefox session.

        Args:
            executable_path: Location of the geckodriver binary.
        """
        self.bot = webdriver.Firefox(executable_path=executable_path)

    @classmethod
    def _parse_announcement(cls, text):
        """Return the captured value of every entry in ``FIELDS``.

        The result is a list in ``FIELDS`` order; a field whose expression
        does not match ``text`` is represented by ``None``.
        """
        values = []
        for _label, pattern in cls.FIELDS:
            match = re.search(pattern, text)
            values.append(match.group(1) if match else None)
        return values

    @staticmethod
    def _wait_clickable(wait, xpath):
        """Wait until the element at ``xpath`` is clickable and return it."""
        return wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

    def _choose_option(self, wait, select_id, option_text):
        """Click the option labelled ``option_text`` in ``<select id=select_id>``."""
        select = self._wait_clickable(
            wait, "//select[@id='{}']".format(select_id)
        )
        select.find_element(
            By.XPATH,
            "//select[@id='{}']/option[text()='{}']".format(
                select_id, option_text
            ),
        ).click()

    def _crawl_result_rows(self, sheet):
        """Open every result row of the current page and record its fields.

        Row ``i`` of the result table is written to sheet row ``i + 1``
        (sheet row 0 holds the headers). Missing fields are written as
        ``"null"``.
        """
        bot = self.bot
        rows_xpath = "//table/tbody/tr"
        # Count the rows on the page being shown; pages can differ in length.
        row_count = len(bot.find_elements(By.XPATH, rows_xpath))
        for i in range(row_count):
            # Rows are looked up again on every pass, as the original did;
            # presumably earlier elements go stale after navigating away.
            result = bot.find_elements(By.XPATH, rows_xpath)[i]
            link = result.find_element(By.TAG_NAME, "a")
            bot.execute_script("arguments[0].click();", link)
            time.sleep(self.PAUSE_SECONDS)

            text = bot.find_element(By.CLASS_NAME, "kungtext").text
            values = self._parse_announcement(text)
            # Each field is handled on its own so that one missing value
            # does not discard the others.
            for column, (label, _pattern) in enumerate(self.FIELDS):
                value = values[column]
                if value is None:
                    print("{} is null".format(label))
                    sheet.write(i + 1, column, "null")
                else:
                    sheet.write(i + 1, column, value)
            if None not in values:
                print(*values)

            bot.back()
            time.sleep(self.PAUSE_SECONDS)

    def navigate_and_crawl(
        self,
        date_from="2019-09-23",
        date_to="2019-09-24",
        output_path="emails.xls",
    ):
        """Run the search and export every result page.

        Args:
            date_from: Start date typed into the ``from`` input.
            date_to: End date typed into the ``tom`` input. Pass e.g.
                ``str(datetime.date.today())`` for a rolling window.
            output_path: File the workbook is saved to after each page.
        """
        bot = self.bot
        pause = self.PAUSE_SECONDS

        bot.get("https://poit.bolagsverket.se/poit/PublikPoitIn.do")
        time.sleep(pause)
        bot.find_element(By.ID, "nav1-2").click()
        time.sleep(pause)
        bot.find_element(By.TAG_NAME, "form").find_element(
            By.TAG_NAME, "a"
        ).click()
        time.sleep(pause)

        # Switch the period selector to a custom range, then fill the range.
        search_form = bot.find_element(By.TAG_NAME, "form")
        search_form.find_element(
            By.XPATH,
            "//select[@id='tidsperiod']/option[text()='Annan period']",
        ).click()
        wait = WebDriverWait(bot, 10)
        self._wait_clickable(wait, "//input[@id='from']").send_keys(date_from)
        self._wait_clickable(wait, "//input[@id='tom']").send_keys(date_to)
        time.sleep(pause)

        self._choose_option(wait, "amnesomrade", "Bolagsverkets registreringar")
        time.sleep(pause)
        self._choose_option(wait, "kungorelserubrik", "Aktiebolagsregistret")
        time.sleep(pause)
        self._choose_option(wait, "underrubrik", "Nyregistreringar")

        # Search button
        self._wait_clickable(wait, "//input[@id='SokKungorelse']").click()
        time.sleep(pause)

        # The pager text contains "av" followed by the page count; the count
        # is cleaned of spaces before conversion.
        pager_text = bot.find_element(
            By.XPATH,
            "//div[@class='gotopagediv']/em[@class='gotopagebuttons']",
        ).text
        number_of_pages = int(
            pager_text.split("av", 1)[1].strip().replace(" ", "")
        )

        wb = Workbook()
        style = xlwt.easyxf("font: bold 1")
        for page in range(number_of_pages):
            sheet = wb.add_sheet("Sheet" + str(page))
            for column, (label, _pattern) in enumerate(self.FIELDS):
                sheet.write(0, column, label, style)

            self._crawl_result_rows(sheet)
            # Save after every page so finished pages survive a later failure.
            wb.save(output_path)

            # There is no further page to move to after the last one.
            if page + 1 < number_of_pages:
                print("Going to next page ...")
                self._wait_clickable(
                    wait, "//input[@id='movenextTop']"
                ).click()
                time.sleep(pause)
# Run the crawl and always shut the browser down afterwards, so a failure
# part-way through does not leave a Firefox/geckodriver process behind.
bot = Bolagsverket()
try:
    bot.navigate_and_crawl()
finally:
    bot.bot.quit()
Loading…
Cancel
Save