From 66494022db8a81969f628db951f58c796c0425e0 Mon Sep 17 00:00:00 2001 From: Drugsosos <44712637+Drugsosos@users.noreply.github.com> Date: Wed, 20 Jul 2022 01:50:03 +0300 Subject: [PATCH] added async playwright --- webdriver/common.py | 8 + webdriver/playwright.py | 331 ++++++++++++++++++++++++++++------------ webdriver/pyppeteer.py | 74 +++++---- 3 files changed, 282 insertions(+), 131 deletions(-) diff --git a/webdriver/common.py b/webdriver/common.py index 3c70a9f..c1c9f35 100644 --- a/webdriver/common.py +++ b/webdriver/common.py @@ -65,3 +65,11 @@ class ExceptionDecorator: if func: exceptor = exceptor(func) return exceptor + + +# Lots of tabs - lots of memory +# chunk needed to minimize memory required +def chunks(lst, n): + """Yield successive n-sized chunks from list.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] diff --git a/webdriver/playwright.py b/webdriver/playwright.py index f1934d9..87716a0 100644 --- a/webdriver/playwright.py +++ b/webdriver/playwright.py @@ -1,15 +1,21 @@ -from playwright.async_api import async_playwright, ViewportSize -from playwright.async_api import Browser, Playwright -from rich.progress import track +from asyncio import as_completed + +from playwright.async_api import async_playwright, TimeoutError +from playwright.async_api import Browser, Playwright, Page, BrowserContext, Locator from pathlib import Path -import translators as ts from utils import settings from utils.console import print_step, print_substep +import translators as ts +from rich.progress import track + from attr import attrs, attrib -from attr.validators import instance_of, optional +from attr.validators import instance_of +from typing import Dict, Optional + +from webdriver.common import ExceptionDecorator, chunks -from typing import Dict, Optional, Union +catch_exception = ExceptionDecorator(default_exception=TimeoutError).catch_exception @attrs @@ -23,15 +29,14 @@ class Browser: validator=instance_of(dict), default={ # 9x21 to see long posts - "defaultViewport": { - "width": 500, - "height": 1200, - }, + "width": 500, + "height": 1200, }, kw_only=True, ) playwright: Playwright browser: Browser + context: BrowserContext async def get_browser( self, @@ -41,30 +46,98 @@ class Browser: """ self.playwright = await async_playwright().start() self.browser = await self.playwright.chromium.launch() + self.context = await self.browser.new_context(viewport=self.default_Viewport) async def close_browser( self, ) -> None: """ - Closes Pyppeteer browser + Closes Playwright stuff """ + await self.context.close() await self.browser.close() await self.playwright.stop() +class Flaky: + """ + All methods decorated with function catching default exceptions and writing logs + """ + + @staticmethod + @catch_exception + def find_element( + query: str, + page_instance: Page, + options: Optional[dict] = None, + ) -> Locator: + return page_instance.locator(query, **options) if options else page_instance.locator(query) + + @catch_exception + async def click( + self, + page_instance: Optional[Page] = None, + query: Optional[str] = None, + options: Optional[dict] = None, + *, + find_options: Optional[dict] = None, + element: Optional[Locator] = None, + ) -> None: + if element: + await element.click(**options) if options else element.click() + else: + results = ( + self.find_element(query, page_instance, **find_options) + if find_options + else self.find_element(query, page_instance) + ) + await results.click(**options) if options else await results.click() + + @catch_exception + async def screenshot( + self, + page_instance: Optional[Page] = None, + query: Optional[str] = None, + options: Optional[dict] = None, + *, + find_options: Optional[dict] = None, + element: Optional[Locator] = None, + ) -> None: + if element: + await element.screenshot(**options) if options else await element.screenshot() + else: + results = ( + self.find_element(query, page_instance, **find_options) + if find_options + else self.find_element(query, page_instance) + ) + await results.screenshot(**options) if options else await results.screenshot() + + @attrs(auto_attribs=True) -class RedditScreenshot(Browser): +class RedditScreenshot(Flaky, Browser): + """ + Args: + reddit_object (Dict): Reddit object received from reddit/subreddit.py + screenshot_idx (int): List with indexes of voiced comments + story_mode (bool): If submission is a story takes screenshot of the story """ - Args: - reddit_object (Dict): Reddit object received from reddit/subreddit.py - screenshot_idx (int): List with indexes of voiced comments - """ reddit_object: dict screenshot_idx: list + story_mode: Optional[bool] = attrib( + validator=instance_of(bool), + default=False, + kw_only=True + ) + + def __attrs_post_init__( + self + ): + self.post_lang: Optional[bool] = settings.config["reddit"]["thread"]["post_lang"] async def __dark_theme( self, - page_instance: PageCls, + page_instance: Page, ) -> None: """ Enables dark theme in Reddit @@ -75,128 +148,190 @@ class RedditScreenshot(Browser): await self.click( page_instance, - "//*[contains(@class, 'header-user-dropdown')]", - {"timeout": 5000}, + "header-user-dropdown", ) # It's normal not to find it, sometimes there is none :shrug: await self.click( page_instance, - "//*[contains(text(), 'Settings')]/ancestor::button[1]", - {"timeout": 5000}, + ":nth-match(button) >> 'Settings'", ) await self.click( page_instance, - "//*[contains(text(), 'Dark Mode')]/ancestor::button[1]", - {"timeout": 5000}, + ":nth-match(button) >> 'Dark Mode'", ) # Closes settings await self.click( page_instance, - "//*[contains(@class, 'header-user-dropdown')]", + "header-user-dropdown" + ) + + async def __close_nsfw( + self, + page_instance: Page, + ) -> None: + """ + Closes NSFW stuff + + Args: + page_instance: Instance of main page + """ + + print_substep("Post is NSFW. You are spicy...") + + # Triggers indirectly reload + await self.click( + page_instance, + 'button:has-text("Yes")', {"timeout": 5000}, ) + # Await indirect reload + await page_instance.wait_for_load_state() + await self.click( + page_instance, + 'button:has-text("Click to see nsfw")', + {"timeout": 5000}, + ) -storymode = False + async def __collect_comment( + self, + comment_obj: dict, + filename_idx: int, + ) -> None: + """ + Makes a screenshot of the comment + Args: + comment_obj: prew comment object + filename_idx: index for the filename + """ + comment_page = await self.context.new_page() + await comment_page.goto(f'https://reddit.com{comment_obj["comment_url"]}') + + # Translates submission' comment + if self.post_lang: + comment_tl = ts.google( + comment_obj["comment_body"], + to_language=self.post_lang, + ) + await comment_page.evaluate( + f"document.querySelector('#t1_{comment_obj['comment_id']} > div:nth-child(2) " + f'> div > div[data-testid="comment"] > div\').textContent = {comment_tl}', + ) -def download_screenshots_of_reddit_posts(reddit_object: dict, screenshot_num: int): - """Downloads screenshots of reddit posts as seen on the web. Downloads to assets/temp/png + await self.screenshot( + comment_page, + f"id=t1_{comment_obj['comment_id']}", + {"path": f"assets/temp/png/comment_{filename_idx}.png"}, + ) - Args: - reddit_object (Dict): Reddit object received from reddit/subreddit.py - screenshot_num (int): Number of screenshots to download - """ - print_step("Downloading screenshots of reddit posts...") + # WIP TODO test it + async def __collect_story( + self, + main_page: Page, + ): + # Translates submission text + if self.post_lang: + story_tl = ts.google( + self.reddit_object["thread_post"], + to_language=self.post_lang, + ) + split_story_tl = story_tl.split('\n') + await main_page.evaluate( + # Find all elements + 'var elements = document.querySelectorAll(`[data-test-id="post-content"]' + ' > [data-click-id="text"] > div > p`);' + # Set array with translated text + f"var texts = {split_story_tl};" + # Map 2 arrays together + "var text_map = texts.map(function(e, i) { return [e, elements[i]]; });" + # Change text on the page + "for (i = 0; i < text_map.length; ++i) { text_map[i][1].textContent = text_map[i][0] ; };" + ) + + await self.screenshot( + main_page, + '[data-click-id="text"]', + {"path": "assets/temp/png/story_content.png"}, + ) - # ! Make sure the reddit screenshots folder exists - Path("assets/temp/png").mkdir(parents=True, exist_ok=True) + async def download( + self, + ): + """ + Downloads screenshots of reddit posts as seen on the web. Downloads to assets/temp/png + """ + print_step("Downloading screenshots of reddit posts...") - with sync_playwright() as p: print_substep("Launching Headless Browser...") + await self.get_browser() - browser = p.chromium.launch() - context = browser.new_context() + # ! Make sure the reddit screenshots folder exists + Path("assets/temp/png").mkdir(parents=True, exist_ok=True) - if settings.config["settings"]["theme"] == "dark": - cookie_file = open("./video_creation/data/cookie-dark-mode.json", encoding="utf-8") - else: - cookie_file = open("./video_creation/data/cookie-light-mode.json", encoding="utf-8") - cookies = json.load(cookie_file) - context.add_cookies(cookies) # load preference cookies # Get the thread screenshot - page = context.new_page() - page.goto(reddit_object["thread_url"], timeout=0) - page.set_viewport_size(ViewportSize(width=1920, height=1080)) - if page.locator('[data-testid="content-gate"]').is_visible(): - # This means the post is NSFW and requires to click the proceed button. - - print_substep("Post is NSFW. You are spicy...") - page.locator('[data-testid="content-gate"] button').click() - page.wait_for_load_state() # Wait for page to fully load + reddit_main = await self.browser.new_page() + # noinspection Duplicates + await reddit_main.goto(self.reddit_object["thread_url"]) - if page.locator('[data-click-id="text"] button').is_visible(): - page.locator( - '[data-click-id="text"] button' - ).click() # Remove "Click to see nsfw" Button in Screenshot + if settings.config["settings"]["theme"] == "dark": + await self.__dark_theme(reddit_main) - # translate code + if self.reddit_object["is_nsfw"]: + # This means the post is NSFW and requires to click the proceed button. + await self.__close_nsfw(reddit_main) + # Translates submission title if settings.config["reddit"]["thread"]["post_lang"]: print_substep("Translating post...") texts_in_tl = ts.google( - reddit_object["thread_title"], + self.reddit_object["thread_title"], to_language=settings.config["reddit"]["thread"]["post_lang"], ) - page.evaluate( - "tl_content => document.querySelector('[data-test-id=\"post-content\"] > div:nth-child(3) > div > div').textContent = tl_content", - texts_in_tl, + await reddit_main.evaluate( + "document.querySelector('[data-test-id=\"post-content\"] > div:nth-child(3) > div > " + f"div').textContent = {texts_in_tl}", ) else: print_substep("Skipping translation...") - page.locator('[data-test-id="post-content"]').screenshot(path="assets/temp/png/title.png") + # No sense to move it in common.py + # noinspection Duplicates + async_tasks_primary = ( + [ + self.__collect_comment(self.reddit_object["comments"][idx], idx) for idx in + self.screenshot_idx + ] + if not self.story_mode + else [ + self.__collect_story(reddit_main) + ] + ) - if storymode: - page.locator('[data-click-id="text"]').screenshot( - path="assets/temp/png/story_content.png" + async_tasks_primary.append( + self.screenshot( + reddit_main, + f"id=t3_{self.reddit_object['thread_id']}", + {"path": "assets/temp/png/title.png"}, ) - else: - for idx, comment in enumerate( - track(reddit_object["comments"], "Downloading screenshots...") + ) + + for idx, chunked_tasks in enumerate( + [chunk for chunk in chunks(async_tasks_primary, 10)], + start=1, + ): + chunk_list = async_tasks_primary.__len__() // 10 + (1 if async_tasks_primary.__len__() % 10 != 0 else 0) + for task in track( + as_completed(chunked_tasks), + description=f"Downloading comments: Chunk {idx}/{chunk_list}", + total=chunked_tasks.__len__(), ): - # Stop if we have reached the screenshot_num - if idx >= screenshot_num: - break - - if page.locator('[data-testid="content-gate"]').is_visible(): - page.locator('[data-testid="content-gate"] button').click() - - page.goto(f'https://reddit.com{comment["comment_url"]}', timeout=0) - - # translate code - - if settings.config["reddit"]["thread"]["post_lang"]: - comment_tl = ts.google( - comment["comment_body"], - to_language=settings.config["reddit"]["thread"]["post_lang"], - ) - page.evaluate( - '([tl_content, tl_id]) => document.querySelector(`#t1_${tl_id} > div:nth-child(2) > div > div[data-testid="comment"] > div`).textContent = tl_content', - [comment_tl, comment["comment_id"]], - ) - try: - page.locator(f"#t1_{comment['comment_id']}").screenshot( - path=f"assets/temp/png/comment_{idx}.png" - ) - except TimeoutError: - del reddit_object["comments"] - screenshot_num += 1 - print("TimeoutError: Skipping screenshot...") - continue - print_substep("Screenshots downloaded Successfully.", style="bold green") + await task + + print_substep("Comments downloaded Successfully.", style="bold green") + await self.close_browser() diff --git a/webdriver/pyppeteer.py b/webdriver/pyppeteer.py index b9b409b..9c630f6 100644 --- a/webdriver/pyppeteer.py +++ b/webdriver/pyppeteer.py @@ -7,18 +7,16 @@ from pyppeteer.element_handle import ElementHandle as ElementHandleCls from pyppeteer.errors import TimeoutError as BrowserTimeoutError from pathlib import Path -from typing import Dict from utils import settings - +from utils.console import print_step, print_substep from rich.progress import track import translators as ts -from utils.console import print_step, print_substep from attr import attrs, attrib -from attr.validators import instance_of, optional +from attr.validators import instance_of from typing import Optional -from webdriver.common import ExceptionDecorator +from webdriver.common import ExceptionDecorator, chunks catch_exception = ExceptionDecorator(default_exception=BrowserTimeoutError).catch_exception @@ -100,8 +98,9 @@ class Wait: self, page_instance: Optional[PageCls] = None, xpath: Optional[str] = None, - find_options: Optional[dict] = None, options: Optional[dict] = None, + *, + find_options: Optional[dict] = None, el: Optional[ElementHandleCls] = None, ) -> None: """ @@ -127,6 +126,7 @@ class Wait: page_instance: Optional[PageCls] = None, xpath: Optional[str] = None, options: Optional[dict] = None, + *, find_options: Optional[dict] = None, el: Optional[ElementHandleCls] = None, ) -> None: @@ -154,14 +154,21 @@ class RedditScreenshot(Browser, Wait): Args: reddit_object (Dict): Reddit object received from reddit/subreddit.py screenshot_idx (int): List with indexes of voiced comments + story_mode (bool): If submission is a story takes screenshot of the story """ reddit_object: dict screenshot_idx: list story_mode: Optional[bool] = attrib( validator=instance_of(bool), default=False, + kw_only=True ) + def __attrs_post_init__( + self, + ): + self.post_lang: Optional[bool] = settings.config["reddit"]["thread"]["post_lang"] + async def __dark_theme( self, page_instance: PageCls, @@ -176,33 +183,40 @@ class RedditScreenshot(Browser, Wait): await self.click( page_instance, "//*[contains(@class, 'header-user-dropdown')]", - {"timeout": 5000}, + find_options={"timeout": 5000}, ) # It's normal not to find it, sometimes there is none :shrug: await self.click( page_instance, "//*[contains(text(), 'Settings')]/ancestor::button[1]", - {"timeout": 5000}, + find_options={"timeout": 5000}, ) await self.click( page_instance, "//*[contains(text(), 'Dark Mode')]/ancestor::button[1]", - {"timeout": 5000}, + find_options={"timeout": 5000}, ) # Closes settings await self.click( page_instance, "//*[contains(@class, 'header-user-dropdown')]", - {"timeout": 5000}, + find_options={"timeout": 5000}, ) async def __close_nsfw( self, - page_instance: PageCls + page_instance: PageCls, ) -> None: + """ + Closes NSFW stuff + + Args: + page_instance: Instance of main page + """ + from asyncio import ensure_future print_substep("Post is NSFW. You are spicy...") @@ -213,17 +227,17 @@ class RedditScreenshot(Browser, Wait): await self.click( page_instance, '//button[text()="Yes"]', - {"timeout": 5000}, + find_options={"timeout": 5000}, ) # Await reload await navigation - await (await self.find_xpath( + await self.click( page_instance, '//button[text()="Click to see nsfw"]', - {"timeout": 5000}, - )).click() + find_options={"timeout": 5000}, + ) async def __collect_comment( self, @@ -241,19 +255,19 @@ class RedditScreenshot(Browser, Wait): await comment_page.goto(f'https://reddit.com{comment_obj["comment_url"]}') # Translates submission' comment - if settings.config["reddit"]["thread"]["post_lang"]: + if self.post_lang: comment_tl = ts.google( comment_obj["comment_body"], - to_language=settings.config["reddit"]["thread"]["post_lang"], + to_language=self.post_lang, ) await comment_page.evaluate( - f'([tl_content, tl_id]) => document.querySelector(`#t1_{comment_obj["comment_id"]} > div:nth-child(2) ' - f'> div > div[data-testid="comment"] > div`).textContent = {comment_tl}', + f"([tl_content, tl_id]) => document.querySelector('#t1_{comment_obj['comment_id']} > div:nth-child(2) " + f'> div > div[data-testid="comment"] > div\').textContent = {comment_tl}', ) await self.screenshot( comment_page, - f'//*[contains(@id, \'t1_{comment_obj["comment_id"]}\')]', + f"//*[contains(@id, 't1_{comment_obj['comment_id']}')]", {"path": f"assets/temp/png/comment_{filename_idx}.png"}, ) @@ -261,13 +275,12 @@ class RedditScreenshot(Browser, Wait): async def __collect_story( self, main_page: PageCls, - ): # Translates submission text - if settings.config["reddit"]["thread"]["post_lang"]: + if self.post_lang: story_tl = ts.google( self.reddit_object["thread_post"], - to_language=settings.config["reddit"]["thread"]["post_lang"], + to_language=self.post_lang, ) split_story_tl = story_tl.split('\n') await main_page.evaluate( @@ -304,6 +317,7 @@ class RedditScreenshot(Browser, Wait): # Get the thread screenshot reddit_main = await self.browser.newPage() + # noinspection Duplicates await reddit_main.goto(self.reddit_object["thread_url"]) if settings.config["settings"]["theme"] == "dark": @@ -322,13 +336,14 @@ class RedditScreenshot(Browser, Wait): ) await reddit_main.evaluate( - "tl_content => document.querySelector('[data-test-id=\"post-content\"] > div:nth-child(3) > div > " - "div').textContent = tl_content", - texts_in_tl, + "document.querySelector('[data-test-id=\"post-content\"] > div:nth-child(3) > div > " + f"div').textContent = {texts_in_tl}", ) else: print_substep("Skipping translation...") + # No sense to move it in common.py + # noinspection Duplicates async_tasks_primary = ( [ self.__collect_comment(self.reddit_object["comments"][idx], idx) for idx in @@ -348,13 +363,6 @@ class RedditScreenshot(Browser, Wait): ) ) - # Lots of tabs - lots of memory - # chunk needed to minimize memory required - def chunks(lst, n): - """Yield successive n-sized chunks from list.""" - for i in range(0, len(lst), n): - yield lst[i:i + n] - for idx, chunked_tasks in enumerate( [chunk for chunk in chunks(async_tasks_primary, 10)], start=1,