From 246e7cd5e9627b76ae7c5696d8748b8a46141a34 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 30 Mar 2020 18:38:01 +0800 Subject: [PATCH] --- Linux_man_cn/nohup.md | 51 ++++++------ Py3Scripts/NumberToChinese.py | 89 ++++++++++---------- Py3Scripts/py_extract_data.py | 117 ++++++++++++++++++++++++++ Py3Scripts/py_multithread.py | 55 +++++++++++++ Py3Scripts/pymongo_example.py | 4 + Py3Scripts/selenium_examples.py | 26 +++--- Py3Scripts/selenium_upwork_data.py | 127 +++++++++++++++++++++++++++++ 7 files changed, 387 insertions(+), 82 deletions(-) create mode 100644 Py3Scripts/py_extract_data.py create mode 100644 Py3Scripts/py_multithread.py create mode 100644 Py3Scripts/selenium_upwork_data.py diff --git a/Linux_man_cn/nohup.md b/Linux_man_cn/nohup.md index 3447c7e..00eefce 100644 --- a/Linux_man_cn/nohup.md +++ b/Linux_man_cn/nohup.md @@ -1,26 +1,25 @@ -# **nohup** - -## 说明 - -**nohup命令** 可以将程序以忽略挂起信号的方式运行起来,被运行的程序的输出信息将不会显示到终端 - -无论是否将 nohup 命令的输出重定向到终端,输出都将附加到当前目录的 nohup.out 文件中。如果当前目录的 nohup.out 文件不可写,输出重定向 -到`$HOME/nohup.out`文件中。如果没有文件能创建或打开以用于追加,那么 command 参数指定的命令不可调用。如果标准错误是一个终端,那么把 -指定的命令写给标准错误的所有输出作为标准输出重定向到相同的文件描述符 - -## 选项 - -```markdown -If standard input is a terminal, redirect it from /dev/null -If standard output is a terminal, append output to 'nohup.out' if possible,'$HOME/nohup.out' otherwise -If standard error is a terminal, redirect it to standard output -To save output to FILE, use 'nohup COMMAND > FILE -``` - -## 实例 - -```bash -nohup command > myout.file 2>&1 & - -``` - +# **nohup** + +## 说明 + +**nohup命令** 可以将程序以忽略挂起信号的方式运行起来,被运行的程序的输出信息将不会显示到终端 + +无论是否将 nohup 命令的输出重定向到终端,输出都将附加到当前目录的 nohup.out 文件中。如果当前目录的 nohup.out 文件不可写,输出重定向 +到`$HOME/nohup.out`文件中。如果没有文件能创建或打开以用于追加,那么 command 参数指定的命令不可调用。如果标准错误是一个终端,那么把 +指定的命令写给标准错误的所有输出作为标准输出重定向到相同的文件描述符 + +## 选项 + +```markdown +If standard input is a terminal, redirect it from /dev/null +If standard output is a terminal, append output to 'nohup.out' if possible,'$HOME/nohup.out' otherwise +If standard 
def digital_to_chinese(digital):
    """Convert a number to Chinese financial (capital) numerals.

    The value is rendered through ``str(digital)``, so ints, floats and
    numeric strings are accepted.  The integer part is suffixed with 圆 and
    the units 拾/佰/仟/万/亿; up to three fractional digits are suffixed
    with 角/分/厘 and any further fractional digits are dropped.  Every
    digit is spelled out, zeros included, so ``4650.0`` (whose string form
    carries a trailing ``.0``) becomes ``肆仟陆佰伍拾零圆零角``.

    A leading minus sign is rendered as the prefix 负 instead of being
    treated as a digit (which used to attach a unit to the ``-`` sign).

    The original string, the unit-annotated string and the final result are
    printed, in that order, because the script relies on that output.

    Args:
        digital: The amount to convert; anything whose ``str()`` is a plain
            decimal number.  Scientific notation (very large or very small
            floats) is not handled.

    Returns:
        The amount written with Chinese capital numerals.
    """
    digit_names = {
        '1': '壹', '2': '贰', '3': '叁', '4': '肆', '5': '伍',
        '6': '陆', '7': '柒', '8': '捌', '9': '玖', '0': '零',
    }
    # Units for positions 1-3 inside each group of four integer digits.
    small_units = ('拾', '佰', '仟')
    # Units for the first three fractional digits.
    fraction_units = ('角', '分', '厘')

    str_digital = str(digital)
    negative = str_digital.startswith('-')
    unsigned = str_digital[1:] if negative else str_digital

    parts = unsigned.split('.')
    yuan = parts[0]
    jiao = parts[1] if len(parts) > 1 else ''

    # Walk the integer digits from the least significant one so that the
    # position alone decides which unit a digit receives.
    integer_pieces = []
    for position, digit in enumerate(reversed(yuan)):
        if position == 0:
            unit = '圆'
        elif position % 4:
            unit = small_units[position % 4 - 1]
        else:
            # Position 4 is 万; every later group boundary is labelled 亿.
            unit = '万' if position == 4 else '亿'
        integer_pieces.append(digit + unit)
    integer_pieces.reverse()

    # Keep at most three fractional digits (nothing smaller than 厘).
    fraction_pieces = [
        digit + unit for digit, unit in zip(jiao[:3], fraction_units)
    ]

    sign = '负' if negative else ''
    last_str = sign + ''.join(integer_pieces + fraction_pieces)
    print(str_digital)
    print(last_str)

    # One pass over the string instead of one str.replace call per character.
    last_str = last_str.translate(str.maketrans(digit_names))
    print(last_str)
    return last_str
def write_csv(
    date,
    operating,
    legal_name,
    physical_address,
    mailing_address,
    usdot_address,
    power_units,
    drivers,
):
    """Write one carrier snapshot to ``<usdot_address>.csv``.

    The file is created (or overwritten) relative to the current working
    directory, encoded as UTF-8, and holds a header row followed by a single
    data row.  ``usdot_address`` only names the file; it is not stored as a
    column.

    Args:
        date: Value for the "Date" column.
        operating: Value for the "Operating Status" column.
        legal_name: Value for the "Legal_Name" column.
        physical_address: Value for the "Physical Address" column.
        mailing_address: Value for the "Mailing Address" column.
        usdot_address: String used as the file name stem.
        power_units: Value for the "Power Units" column.
        drivers: Value for the "Drivers" column.
    """
    # Single source of truth for both the column order and the row content.
    columns = (
        ("Date", date),
        ("Operating Status", operating),
        ("Legal_Name", legal_name),
        ("Physical Address", physical_address),
        ("Mailing Address", mailing_address),
        ("Power Units", power_units),
        ("Drivers", drivers),
    )
    header = [title for title, _ in columns]
    record = [value for _, value in columns]

    target = usdot_address + ".csv"
    with open(target, mode="w", newline="", encoding="utf-8") as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        out.writerow(record)
def print_date(threadName, counter):
    """Print today's date tagged with the calling thread's name and counter.

    The line has the form ``<threadName>[<counter>]: <YYYY-MM-DD>``.

    Args:
        threadName: Label printed in front of the counter.
        counter: Value printed inside the square brackets.
    """
    template = "{}[{}]: {}"
    current_date = datetime.date.today()
    print(template.format(threadName, counter, current_date))
search_box.send_keys("searchconent") # 从父元素的上下文查找匹配子webelement的列表 -element = browser.find_element_by_tag_name('div') +element = browser.find_element_by_tag_name("div") sub_elements = element.find_element_by_tag_name("p") for ele in sub_elements: print(ele.text) @@ -252,25 +252,27 @@ webdriver.ActionChains(browser).move_to_element(gmailLink).perform() xOffset = 100 yOffset = 100 # 将鼠标移动到指定坐标位置,可移到窗口之外 -webdriver.ActionChains(browser).move_by_offset(xOffset,yOffset).perform() +webdriver.ActionChains(browser).move_by_offset(xOffset, yOffset).perform() # 在一个元素点击并按住,然后移到另一个元素 sourceEle = driver.find_element_by_id("draggable") -targetEle = driver.find_element_by_id("droppable") +targetEle = driver.find_element_by_id("droppable") # 鼠标从sourceEle移动到targetEle元素 -webdriver.ActionChains(browser).drag_and_drop(sourceEle,targetEle).perform() +webdriver.ActionChains(browser).drag_and_drop(sourceEle, targetEle).perform() # 在一个元素点击并按住,然后移动一定的偏移量 targetEleXOffset = targetEle.location.get("x") targetEleYOffset = targetEle.location.get("y") -webdriver.ActionChains(browser).drag_and_drop_by_offset(sourceEle, targetEleXOffset, targetEleYOffset).perform() +webdriver.ActionChains(browser).drag_and_drop_by_offset( + sourceEle, targetEleXOffset, targetEleYOffset +).perform() # 释放按下的鼠标左键,如果webelement移动了,将自动释放在给定webelement上按下的鼠标左键 webdriver.ActionChains(browser).release().perform() # 添加cookies # 常用于将cookie添加到当前访问的上下文中. 
class Bolagsverket:
    """Scrape newly registered companies from the Bolagsverket POIT site.

    The crawler drives Firefox through the public search form, opens every
    announcement in the result list and stores four fields of each one in an
    ``.xls`` workbook, one sheet per result page.

    Relies on the module-level imports of the script: ``time``, ``re``,
    ``xlwt``, ``Workbook``, ``webdriver``, ``By``, ``EC`` and
    ``WebDriverWait``.  It uses the Selenium 3 style ``find_element_by_*``
    API, as the rest of the script does.
    """

    START_URL = "https://poit.bolagsverket.se/poit/PublikPoitIn.do"
    ROWS_XPATH = "//table/tbody/tr"
    # (sheet column title, regex whose first group is the value), in column
    # order.  Patterns are applied to the text of the "kungtext" element.
    FIELDS = (
        ("Post Address", r"Postadress:(.*),"),
        ("Bildat", r"Bildat:(.*)\n"),
        ("Foretagsnamn", r"Företagsnamn:(.*)\n"),
        ("Email", r"E-post:(.*)\n"),
    )

    def __init__(self, executable_path="E:/geckodriver"):
        """Start a Firefox session.

        Args:
            executable_path: Location of the geckodriver binary.
        """
        self.bot = webdriver.Firefox(executable_path=executable_path)

    def navigate_and_crawl(
        self,
        date_from="2019-09-23",
        date_to="2019-09-24",
        output_path="emails.xls",
    ):
        """Run the search and write every result page to the workbook.

        Args:
            date_from: First day of the search period, ``YYYY-MM-DD``.  Pass
                ``str(datetime.date.today() - datetime.timedelta(1))`` to
                search from yesterday.
            date_to: Last day of the search period, ``YYYY-MM-DD``.
            output_path: Workbook file; rewritten after every page so that
                an interrupted run keeps the pages collected so far.
        """
        bot = self.bot
        wait = WebDriverWait(bot, 10)

        self._open_search_form()
        self._submit_search(wait, date_from, date_to)

        # The pager text is split on "av"; what follows is the page count.
        # Strip blanks before converting (the cleaned value used to be
        # thrown away).
        pager_text = bot.find_element_by_xpath(
            "//div[@class='gotopagediv']/em[@class='gotopagebuttons']"
        ).text
        number_of_pages = int(
            pager_text.split("av", 1)[1].strip().replace(" ", "")
        )

        wb = Workbook()
        header_style = xlwt.easyxf("font: bold 1")
        for page in range(number_of_pages):
            sheet = wb.add_sheet("Sheet" + str(page))
            for column, (title, _) in enumerate(self.FIELDS):
                sheet.write(0, column, title, header_style)

            self._scrape_page(sheet)
            wb.save(output_path)

            # There is nothing to click after the last page; waiting for the
            # button there would only end the run with a timeout.
            if page + 1 < number_of_pages:
                print("Going to next page ...")
                button_next = wait.until(
                    EC.element_to_be_clickable(
                        (By.XPATH, "//input[@id='movenextTop']")
                    )
                )
                button_next.click()
                time.sleep(5)

    def _open_search_form(self):
        """Load the start page and click through to the search form."""
        bot = self.bot
        bot.get(self.START_URL)
        time.sleep(5)
        bot.find_element_by_id("nav1-2").click()
        time.sleep(5)
        bot.find_element_by_tag_name("form").find_element_by_tag_name(
            "a"
        ).click()
        time.sleep(5)

    def _submit_search(self, wait, date_from, date_to):
        """Fill in the period and category selectors, then start the search."""
        bot = self.bot
        bot.find_element_by_xpath(
            "//select[@id='tidsperiod']/option[text()='Annan period']"
        ).click()
        wait.until(
            EC.element_to_be_clickable((By.XPATH, "//input[@id='from']"))
        ).send_keys(date_from)
        wait.until(
            EC.element_to_be_clickable((By.XPATH, "//input[@id='tom']"))
        ).send_keys(date_to)
        time.sleep(5)

        self._choose_option(wait, "amnesomrade", "Bolagsverkets registreringar")
        time.sleep(5)
        self._choose_option(wait, "kungorelserubrik", "Aktiebolagsregistret")
        time.sleep(5)
        self._choose_option(wait, "underrubrik", "Nyregistreringar")

        wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//input[@id='SokKungorelse']")
            )
        ).click()
        time.sleep(5)

    def _choose_option(self, wait, select_id, label):
        """Wait for the ``<select>`` with *select_id*, then click *label*."""
        select_xpath = "//select[@id='{}']".format(select_id)
        wait.until(EC.element_to_be_clickable((By.XPATH, select_xpath)))
        self.bot.find_element_by_xpath(
            "{}/option[text()='{}']".format(select_xpath, label)
        ).click()

    def _scrape_page(self, sheet):
        """Open every result row of the current page and record its fields."""
        bot = self.bot
        # Count the rows of *this* page; later pages need not have as many
        # rows as the first one.
        row_count = len(bot.find_elements_by_xpath(self.ROWS_XPATH))
        for i in range(row_count):
            # Rows are looked up again on every iteration, as before;
            # presumably references from before bot.back() are no longer
            # usable (TODO confirm).
            result = bot.find_elements_by_xpath(self.ROWS_XPATH)[i]
            link = result.find_element_by_tag_name("a")
            bot.execute_script("arguments[0].click();", link)
            time.sleep(5)

            text = bot.find_element_by_class_name("kungtext").text
            values = []
            for column, (title, pattern) in enumerate(self.FIELDS):
                # Each field is handled on its own so that one missing field
                # no longer hides the others or gets reported as the e-mail.
                match = re.search(pattern, text)
                if match is None:
                    print(title + " is null")
                    value = "null"
                else:
                    value = str(match.group(1))
                sheet.write(i + 1, column, value)
                values.append(value)
            print(*values)

            bot.back()
            time.sleep(5)