mirror of https://github.com/Black-Gold/Learn
master
parent
6aefff0af3
commit
246e7cd5e9
@ -0,0 +1,117 @@
|
|||||||
|
import re
|
||||||
|
import csv
|
||||||
|
import urllib.request
|
||||||
|
from urllib.request import urlopen, Request
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import xlrd
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Module-level accumulator of USDOT numbers read from the spreadsheet.
dots = []


def read_excel_file(path="dots.xls", start_row=1, end_row=5):
    """Load USDOT numbers from an Excel workbook into the module-level `dots`.

    Reads column 0 of the first sheet of *path* for rows
    [start_row, end_row) and appends each value to `dots`, stripping the
    ".0" suffix that xlrd adds because numeric cells come back as floats.

    The defaults reproduce the original hard-coded behavior
    ("dots.xls", rows 1-4); the range is clamped to the sheet's actual
    row count so a shorter sheet no longer raises IndexError.
    """
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_index(0)
    # Never index past the last populated row.
    last_row = min(end_row, sheet.nrows)
    for i in range(start_row, last_row):
        # Numeric cells arrive as floats like 12345.0 -> "12345".
        dot = str(sheet.cell_value(i, 0)).replace(".0", "")
        dots.append(dot)
|
||||||
|
|
||||||
|
|
||||||
|
def crawl_data(url):
    """Fetch a SAFER carrier-snapshot page and write its fields to a CSV.

    Downloads *url*, extracts the snapshot date from the page's bold text
    and the carrier fields (operating status, legal name, addresses, USDOT
    number, power units, drivers) from the <center> block, then hands
    everything to write_csv().

    Raises AttributeError if any of the labeled fields is missing from the
    page (re.search returns None and .group() fails).
    """
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    html = urlopen(req).read()
    bs = BeautifulSoup(html, "html.parser")

    # BUG FIX: `date` was only bound inside the loop below, so a page with
    # no matching <b> tag made the write_csv() call raise NameError.
    date = ""
    bold_texts = bs.find_all("b")
    for b in bold_texts:
        try:
            date = (
                re.search(
                    r"The information below reflects the content of the FMCSA management information systems as of(.*).",
                    b.get_text(strip=True, separator=" "),
                )
                .group(1)
                .strip()
            )
            if len(date) > 11:
                # Keep only the text before the first period.
                date = date.split(".", 1)[0]
            print(date)
        except AttributeError:
            # This <b> tag does not contain the snapshot-date sentence.
            pass

    information = bs.find("center").get_text(strip=True, separator=" ")

    # Each field sits between two fixed labels on the snapshot page.
    operating = re.search(r"Operating Status:(.*)Out", information).group(1).strip()
    legal_name = re.search(r"Legal Name:(.*)DBA", information).group(1).strip()
    physical_address = (
        re.search(r"Physical Address:(.*)Phone", information).group(1).strip()
    )
    mailing_address = (
        re.search(r"Mailing Address:(.*)USDOT", information).group(1).strip()
    )
    usdot_address = (
        re.search(r"USDOT Number:(.*)State Carrier ID Number", information)
        .group(1)
        .strip()
    )
    power_units = re.search(r"Power Units:(.*)Drivers", information).group(1).strip()
    drivers = re.search(r"Drivers:(.*)MCS-150 Form Date", information).group(1).strip()

    write_csv(
        date,
        operating,
        legal_name,
        physical_address,
        mailing_address,
        usdot_address,
        power_units,
        drivers,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def write_csv(
    date,
    operating,
    legal_name,
    physical_address,
    mailing_address,
    usdot_address,
    power_units,
    drivers,
):
    """Write one carrier's snapshot to `<usdot_address>.csv`.

    Creates (or overwrites) a CSV named after the carrier's USDOT number,
    containing a header row followed by a single data row. The USDOT
    number itself is used only for the filename, not as a column.
    """
    # Build the row once; the dict's insertion order doubles as the
    # CSV column order.
    row = {
        "Date": date,
        "Operating Status": operating,
        "Legal_Name": legal_name,
        "Physical Address": physical_address,
        "Mailing Address": mailing_address,
        "Power Units": power_units,
        "Drivers": drivers,
    }
    with open(
        usdot_address + ".csv", mode="w", newline="", encoding="utf-8"
    ) as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(row))
        writer.writeheader()
        writer.writerow(row)
|
||||||
|
|
||||||
|
|
||||||
|
# Load the USDOT numbers from the spreadsheet, then crawl each carrier's
# SAFER snapshot page, pausing between requests.
read_excel_file()
print(dots)

for dot in dots:
    crawl_data(
        "https://safer.fmcsa.dot.gov/query.asp?searchtype=ANY"
        "&query_type=queryCarrierSnapshot&query_param=USDOT&query_string=" + dot
    )
    # Be polite to the server between requests.
    time.sleep(5)
|
@ -0,0 +1,55 @@
|
|||||||
|
# Python多线程例子来示例加线程锁
|
||||||
|
# 1。使用线程定义一个子类。线程类
|
||||||
|
# 2。实例化子类并触发线程
|
||||||
|
# 3。在线程的运行方法中实现锁
|
||||||
|
|
||||||
|
import threading
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
# NOTE(review): exitFlag is never read anywhere visible — kept for
# compatibility in case another chunk of the project references it.
exitFlag = 0


class myThread(threading.Thread):
    """Worker thread that prints today's date under a shared lock.

    Acquires the module-level `threadLock` around print_date() so
    concurrent workers never interleave their output.
    """

    def __init__(self, name, counter):
        super().__init__()
        # `name` is set after the base __init__ so it overrides the
        # auto-generated "Thread-N" name.
        self.name = name
        self.threadID = counter
        self.counter = counter

    def run(self):
        print("\n开始 " + self.name)
        # Take the shared lock so print_date output is serialized...
        threadLock.acquire()
        print_date(self.name, self.counter)
        # ...then hand it to the next waiting worker.
        threadLock.release()
        print("退出 " + self.name)
|
||||||
|
|
||||||
|
|
||||||
|
def print_date(threadName, counter):
    """Print "<threadName>[<counter>]: <today's date>" to stdout.

    The original buffered today's date in a single-element list
    (`datefields`) and immediately read it back — the list served no
    purpose, so the date is now used directly.
    """
    today = datetime.date.today()
    print("{}[{}]: {}".format(threadName, counter, today))
|
||||||
|
|
||||||
|
|
||||||
|
# Shared lock guarding print_date(); must exist before any worker starts.
threadLock = threading.Lock()
threads = []

# Build the two workers.
thread1 = myThread("线程", 1)
thread2 = myThread("线程", 2)

# Launch them.
thread1.start()
thread2.start()

# Keep references so we can join below.
threads.extend([thread1, thread2])

# Block until every worker has finished.
for thread in threads:
    thread.join()

print("\n退出程序!!!")
|
@ -0,0 +1,127 @@
|
|||||||
|
import time
|
||||||
|
import datetime
|
||||||
|
import re
|
||||||
|
import xlwt
|
||||||
|
from xlwt import Workbook
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
|
||||||
|
|
||||||
|
class Bolagsverket:
    """Selenium crawler for the Swedish Bolagsverket announcement site.

    Searches poit.bolagsverket.se for new limited-company registrations in
    a hard-coded date range and writes each hit's postal address, formation
    date, company name and e-mail address to "emails.xls" — one worksheet
    per results page.
    """

    def __init__(self):
        # NOTE(review): `executable_path` is deprecated in Selenium 4 —
        # confirm the installed selenium version before changing this.
        self.bot = webdriver.Firefox(
            executable_path="E:/geckodriver"
        )

    def navigate_and_crawl(self):
        """Run the search workflow and scrape every results page.

        Side effects: drives the Firefox window, prints scraped fields,
        and saves "emails.xls" after each page.
        """
        bot = self.bot
        bot.get("https://poit.bolagsverket.se/poit/PublikPoitIn.do")
        time.sleep(5)
        bot.find_element_by_id("nav1-2").click()
        time.sleep(5)
        bot.find_element_by_tag_name("form").find_element_by_tag_name("a").click()
        time.sleep(5)

        # Choose a custom time period and fill in the from/to dates.
        search_form = bot.find_element_by_tag_name("form")
        search_form.find_element_by_xpath(
            "//select[@id='tidsperiod']/option[text()='Annan period']"
        ).click()
        wait = WebDriverWait(bot, 10)
        input_from = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//input[@id='from']"))
        )
        input_from.send_keys("2019-09-23")
        # input_from.send_keys(str(datetime.date.today()-datetime.timedelta(1)))
        input_to = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//input[@id='tom']"))
        )
        input_to.send_keys("2019-09-24")
        # input_to.send_keys(str(datetime.date.today()))
        time.sleep(5)

        # Narrow the search: subject area, announcement type, sub-heading.
        amnesomrade = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//select[@id='amnesomrade']"))
        )
        amnesomrade.find_element_by_xpath(
            "//select[@id='amnesomrade']/option[text()='Bolagsverkets registreringar']"
        ).click()
        time.sleep(5)
        kungorelserubrik = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//select[@id='kungorelserubrik']"))
        )
        kungorelserubrik.find_element_by_xpath(
            "//select[@id='kungorelserubrik']/option[text()='Aktiebolagsregistret']"
        ).click()
        time.sleep(5)
        underrubrik = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//select[@id='underrubrik']"))
        )
        underrubrik.find_element_by_xpath(
            "//select[@id='underrubrik']/option[text()='Nyregistreringar']"
        ).click()

        # Search Button
        button_sok = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//input[@id='SokKungorelse']"))
        )
        button_sok.click()
        time.sleep(5)

        # The page counter reads e.g. "Sida 1 av 12" — take the total.
        number_of_pages = bot.find_element_by_xpath(
            "//div[@class='gotopagediv']/em[@class='gotopagebuttons']"
        ).text.split("av", 1)[1]
        # BUG FIX: str.strip()/str.replace() return new strings, and the
        # original discarded the result, so int() below could fail on
        # surrounding whitespace.
        number_of_pages = number_of_pages.strip().replace(" ", "")

        number_of_results = bot.find_elements_by_xpath("//table/tbody/tr")

        wb = Workbook()
        for page in range(int(number_of_pages)):
            sheet = wb.add_sheet("Sheet" + str(page))
            style = xlwt.easyxf("font: bold 1")
            sheet.write(0, 0, "Post Address", style)
            sheet.write(0, 1, "Bildat", style)
            sheet.write(0, 2, "Foretagsnamn", style)
            sheet.write(0, 3, "Email", style)

            for i in range(len(number_of_results)):
                # Re-query the rows each time: the DOM is rebuilt after
                # bot.back(), so cached elements would be stale.
                result = bot.find_elements_by_xpath("//table/tbody/tr")[i]
                link = result.find_element_by_tag_name("a")
                bot.execute_script("arguments[0].click();", link)
                time.sleep(5)

                information = [bot.find_element_by_class_name("kungtext").text]
                try:
                    postaddress = re.search("Postadress:(.*),", information[0])
                    sheet.write(i + 1, 0, str(postaddress.group(1)))
                    bildat = re.search("Bildat:(.*)\n", information[0])
                    sheet.write(i + 1, 1, str(bildat.group(1)))
                    foretagsnamn = re.search("Företagsnamn:(.*)\n", information[0])
                    sheet.write(i + 1, 2, str(foretagsnamn.group(1)))
                    email = re.search("E-post:(.*)\n", information[0])
                    sheet.write(i + 1, 3, str(email.group(1)))
                    print(
                        postaddress.group(1),
                        bildat.group(1),
                        foretagsnamn.group(1),
                        email.group(1),
                    )
                except AttributeError:
                    # Any missing field makes .group() raise on None;
                    # record the row as having no e-mail and carry on.
                    print("Email is null")
                    sheet.write(i + 1, 3, "null")
                bot.back()
                time.sleep(5)

            wb.save("emails.xls")
            print("Going to next page ...")
            # BUG FIX: the original XPath "//input/[@id='movenextTop']" is
            # syntactically invalid (slash before the predicate) and would
            # raise InvalidSelectorException instead of paging forward.
            button_next = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//input[@id='movenextTop']"))
            )
            button_next.click()
            time.sleep(5)
||||||
|
|
||||||
|
# Entry point: build the crawler (opens a Firefox window) and run the
# whole search-and-scrape session.
bot = Bolagsverket()
bot.navigate_and_crawl()
|
Loading…
Reference in new issue