added async playwright

pull/963/head
Drugsosos 2 years ago
parent 27577d0da6
commit 66494022db
No known key found for this signature in database
GPG Key ID: 8E35176FE617E28D

@ -65,3 +65,11 @@ class ExceptionDecorator:
if func: if func:
exceptor = exceptor(func) exceptor = exceptor(func)
return exceptor return exceptor
# Lots of tabs - lots of memory
# chunk needed to minimize memory required
def chunks(lst, n):
    """Yield successive n-sized chunks from list.

    Args:
        lst: Sequence supporting ``len()`` and slicing (e.g. a list of tasks).
        n: Maximum number of items per chunk; must be a positive integer.

    Yields:
        Consecutive slices of ``lst`` holding at most ``n`` items each; only
        the last slice may be shorter. An empty ``lst`` yields nothing.

    Raises:
        ValueError: If ``n`` is smaller than 1. The check runs when the first
            chunk is requested, because this is a generator.
    """
    # A negative step would silently produce no chunks at all and a zero step
    # would surface as an unrelated range() error, so reject both up front.
    if n < 1:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]

@ -1,15 +1,21 @@
from playwright.async_api import async_playwright, ViewportSize from asyncio import as_completed
from playwright.async_api import Browser, Playwright
from rich.progress import track from playwright.async_api import async_playwright, TimeoutError
from playwright.async_api import Browser, Playwright, Page, BrowserContext, Locator
from pathlib import Path from pathlib import Path
import translators as ts
from utils import settings from utils import settings
from utils.console import print_step, print_substep from utils.console import print_step, print_substep
import translators as ts
from rich.progress import track
from attr import attrs, attrib from attr import attrs, attrib
from attr.validators import instance_of, optional from attr.validators import instance_of
from typing import Dict, Optional
from typing import Dict, Optional, Union from webdriver.common import ExceptionDecorator, chunks
catch_exception = ExceptionDecorator(default_exception=TimeoutError).catch_exception
@attrs @attrs
@ -23,15 +29,14 @@ class Browser:
validator=instance_of(dict), validator=instance_of(dict),
default={ default={
# 9x21 to see long posts # 9x21 to see long posts
"defaultViewport": {
"width": 500, "width": 500,
"height": 1200, "height": 1200,
}, },
},
kw_only=True, kw_only=True,
) )
playwright: Playwright playwright: Playwright
browser: Browser browser: Browser
context: BrowserContext
async def get_browser( async def get_browser(
self, self,
@ -41,30 +46,98 @@ class Browser:
""" """
self.playwright = await async_playwright().start() self.playwright = await async_playwright().start()
self.browser = await self.playwright.chromium.launch() self.browser = await self.playwright.chromium.launch()
self.context = await self.browser.new_context(viewport=self.default_Viewport)
async def close_browser( async def close_browser(
self, self,
) -> None: ) -> None:
""" """
Closes Pyppeteer browser Closes Playwright stuff
""" """
await self.context.close()
await self.browser.close() await self.browser.close()
await self.playwright.stop() await self.playwright.stop()
class Flaky:
    """
    Playwright actions that are allowed to fail.

    All public methods are decorated with ``catch_exception``, the function
    catching default exceptions and writing logs.
    """

    @staticmethod
    @catch_exception
    def find_element(
            query: str,
            page_instance: Page,
            options: Optional[dict] = None,
    ) -> Locator:
        """
        Builds a locator for the query on the given page

        Args:
            query: Selector passed to ``page_instance.locator``
            page_instance: Page the element is searched on
            options: Extra keyword arguments for ``locator``, given as one dict

        Returns:
            Locator for the query
        """
        if options:
            return page_instance.locator(query, **options)
        return page_instance.locator(query)

    def _resolve_locator(
            self,
            page_instance: Optional[Page],
            query: Optional[str],
            find_options: Optional[dict],
            element: Optional[Locator],
    ) -> Locator:
        """
        Returns the explicitly given element or looks the query up on the page
        """
        if element is not None:
            return element
        # find_element takes its options as a single dict argument, so the dict
        # must be handed over as-is and not unpacked into keyword arguments
        return self.find_element(query, page_instance, find_options)

    @catch_exception
    async def click(
            self,
            page_instance: Optional[Page] = None,
            query: Optional[str] = None,
            options: Optional[dict] = None,
            *,
            find_options: Optional[dict] = None,
            element: Optional[Locator] = None,
    ) -> None:
        """
        Clicks the given element or the element found by the query

        Args:
            page_instance: Page to search on, used when no element is given
            query: Selector of the element, used when no element is given
            options: Keyword arguments forwarded to ``Locator.click``
            find_options: Options dict forwarded to ``find_element``
            element: Already located element, takes precedence over the query
        """
        target = self._resolve_locator(page_instance, query, find_options, element)
        # Always awaited: a click coroutine that is never awaited never runs
        await target.click(**(options or {}))

    @catch_exception
    async def screenshot(
            self,
            page_instance: Optional[Page] = None,
            query: Optional[str] = None,
            options: Optional[dict] = None,
            *,
            find_options: Optional[dict] = None,
            element: Optional[Locator] = None,
    ) -> None:
        """
        Takes a screenshot of the given element or the element found by the query

        Args:
            page_instance: Page to search on, used when no element is given
            query: Selector of the element, used when no element is given
            options: Keyword arguments forwarded to ``Locator.screenshot``
            find_options: Options dict forwarded to ``find_element``
            element: Already located element, takes precedence over the query
        """
        target = self._resolve_locator(page_instance, query, find_options, element)
        await target.screenshot(**(options or {}))
@attrs(auto_attribs=True) @attrs(auto_attribs=True)
class RedditScreenshot(Browser): class RedditScreenshot(Flaky, Browser):
""" """
Args: Args:
reddit_object (Dict): Reddit object received from reddit/subreddit.py reddit_object (Dict): Reddit object received from reddit/subreddit.py
screenshot_idx (int): List with indexes of voiced comments screenshot_idx (int): List with indexes of voiced comments
story_mode (bool): If submission is a story takes screenshot of the story
""" """
reddit_object: dict reddit_object: dict
screenshot_idx: list screenshot_idx: list
story_mode: Optional[bool] = attrib(
validator=instance_of(bool),
default=False,
kw_only=True
)
def __attrs_post_init__(
        self
):
    """
    Caches the configured post language on the instance

    The value is later passed as ``to_language`` to the translator, and a
    falsy value disables translation, so it is annotated as an optional string
    rather than a bool.
    """
    self.post_lang: Optional[str] = settings.config["reddit"]["thread"]["post_lang"]
async def __dark_theme( async def __dark_theme(
self, self,
page_instance: PageCls, page_instance: Page,
) -> None: ) -> None:
""" """
Enables dark theme in Reddit Enables dark theme in Reddit
@ -75,128 +148,190 @@ class RedditScreenshot(Browser):
await self.click( await self.click(
page_instance, page_instance,
"//*[contains(@class, 'header-user-dropdown')]", "header-user-dropdown",
{"timeout": 5000},
) )
# It's normal not to find it, sometimes there is none :shrug: # It's normal not to find it, sometimes there is none :shrug:
await self.click( await self.click(
page_instance, page_instance,
"//*[contains(text(), 'Settings')]/ancestor::button[1]", ":nth-match(button) >> 'Settings'",
{"timeout": 5000},
) )
await self.click( await self.click(
page_instance, page_instance,
"//*[contains(text(), 'Dark Mode')]/ancestor::button[1]", ":nth-match(button) >> 'Dark Mode'",
{"timeout": 5000},
) )
# Closes settings # Closes settings
await self.click( await self.click(
page_instance, page_instance,
"//*[contains(@class, 'header-user-dropdown')]", "header-user-dropdown"
{"timeout": 5000},
) )
async def __close_nsfw(
self,
page_instance: Page,
) -> None:
"""
Closes NSFW stuff
Args:
page_instance: Instance of main page
"""
storymode = False print_substep("Post is NSFW. You are spicy...")
# Triggers indirectly reload
await self.click(
page_instance,
'button:has-text("Yes")',
{"timeout": 5000},
)
def download_screenshots_of_reddit_posts(reddit_object: dict, screenshot_num: int): # Await indirect reload
"""Downloads screenshots of reddit posts as seen on the web. Downloads to assets/temp/png await page_instance.wait_for_load_state()
await self.click(
page_instance,
'button:has-text("Click to see nsfw")',
{"timeout": 5000},
)
async def __collect_comment(
self,
comment_obj: dict,
filename_idx: int,
) -> None:
"""
Makes a screenshot of the comment
Args: Args:
reddit_object (Dict): Reddit object received from reddit/subreddit.py comment_obj: prew comment object
screenshot_num (int): Number of screenshots to download filename_idx: index for the filename
""" """
print_step("Downloading screenshots of reddit posts...") comment_page = await self.context.new_page()
await comment_page.goto(f'https://reddit.com{comment_obj["comment_url"]}')
# ! Make sure the reddit screenshots folder exists # Translates submission' comment
Path("assets/temp/png").mkdir(parents=True, exist_ok=True) if self.post_lang:
comment_tl = ts.google(
comment_obj["comment_body"],
to_language=self.post_lang,
)
await comment_page.evaluate(
f"document.querySelector('#t1_{comment_obj['comment_id']} > div:nth-child(2) "
f'> div > div[data-testid="comment"] > div\').textContent = {comment_tl}',
)
await self.screenshot(
comment_page,
f"id=t1_{comment_obj['comment_id']}",
{"path": f"assets/temp/png/comment_{filename_idx}.png"},
)
# WIP TODO test it
async def __collect_story(
        self,
        main_page: Page,
):
    """
    Makes a screenshot of the story text of the submission

    When a post language is configured, the story is translated first and the
    paragraphs on the page are overwritten with the translated lines.

    Args:
        main_page: Page with the opened submission
    """
    # Translates submission text
    if self.post_lang:
        story_tl = ts.google(
            self.reddit_object["thread_post"],
            to_language=self.post_lang,
        )
        split_story_tl = story_tl.split('\n')

        # The translated lines are handed over as the evaluate() argument
        # instead of being formatted into the script, so quotes, backslashes or
        # markup in the text can never break or alter the JavaScript
        await main_page.evaluate(
            "texts => {"
            # Find all elements
            ' const elements = document.querySelectorAll(`[data-test-id="post-content"]'
            ' > [data-click-id="text"] > div > p`);'
            # Translation and page may disagree on the number of paragraphs
            " const count = Math.min(texts.length, elements.length);"
            # Change text on the page
            " for (let i = 0; i < count; ++i) { elements[i].textContent = texts[i]; }"
            " }",
            split_story_tl,
        )

    await self.screenshot(
        main_page,
        '[data-click-id="text"]',
        {"path": "assets/temp/png/story_content.png"},
    )
async def download(
self,
):
"""
Downloads screenshots of reddit posts as seen on the web. Downloads to assets/temp/png
"""
print_step("Downloading screenshots of reddit posts...")
with sync_playwright() as p:
print_substep("Launching Headless Browser...") print_substep("Launching Headless Browser...")
await self.get_browser()
browser = p.chromium.launch() # ! Make sure the reddit screenshots folder exists
context = browser.new_context() Path("assets/temp/png").mkdir(parents=True, exist_ok=True)
if settings.config["settings"]["theme"] == "dark":
cookie_file = open("./video_creation/data/cookie-dark-mode.json", encoding="utf-8")
else:
cookie_file = open("./video_creation/data/cookie-light-mode.json", encoding="utf-8")
cookies = json.load(cookie_file)
context.add_cookies(cookies) # load preference cookies
# Get the thread screenshot # Get the thread screenshot
page = context.new_page() reddit_main = await self.browser.new_page()
page.goto(reddit_object["thread_url"], timeout=0) # noinspection Duplicates
page.set_viewport_size(ViewportSize(width=1920, height=1080)) await reddit_main.goto(self.reddit_object["thread_url"])
if page.locator('[data-testid="content-gate"]').is_visible():
# This means the post is NSFW and requires to click the proceed button.
print_substep("Post is NSFW. You are spicy...")
page.locator('[data-testid="content-gate"] button').click()
page.wait_for_load_state() # Wait for page to fully load
if page.locator('[data-click-id="text"] button').is_visible(): if settings.config["settings"]["theme"] == "dark":
page.locator( await self.__dark_theme(reddit_main)
'[data-click-id="text"] button'
).click() # Remove "Click to see nsfw" Button in Screenshot
# translate code if self.reddit_object["is_nsfw"]:
# This means the post is NSFW and requires to click the proceed button.
await self.__close_nsfw(reddit_main)
# Translates submission title
if settings.config["reddit"]["thread"]["post_lang"]: if settings.config["reddit"]["thread"]["post_lang"]:
print_substep("Translating post...") print_substep("Translating post...")
texts_in_tl = ts.google( texts_in_tl = ts.google(
reddit_object["thread_title"], self.reddit_object["thread_title"],
to_language=settings.config["reddit"]["thread"]["post_lang"], to_language=settings.config["reddit"]["thread"]["post_lang"],
) )
page.evaluate( await reddit_main.evaluate(
"tl_content => document.querySelector('[data-test-id=\"post-content\"] > div:nth-child(3) > div > div').textContent = tl_content", "document.querySelector('[data-test-id=\"post-content\"] > div:nth-child(3) > div > "
texts_in_tl, f"div').textContent = {texts_in_tl}",
) )
else: else:
print_substep("Skipping translation...") print_substep("Skipping translation...")
page.locator('[data-test-id="post-content"]').screenshot(path="assets/temp/png/title.png") # No sense to move it in common.py
# noinspection Duplicates
if storymode: async_tasks_primary = (
page.locator('[data-click-id="text"]').screenshot( [
path="assets/temp/png/story_content.png" self.__collect_comment(self.reddit_object["comments"][idx], idx) for idx in
self.screenshot_idx
]
if not self.story_mode
else [
self.__collect_story(reddit_main)
]
) )
else:
for idx, comment in enumerate(
track(reddit_object["comments"], "Downloading screenshots...")
):
# Stop if we have reached the screenshot_num
if idx >= screenshot_num:
break
if page.locator('[data-testid="content-gate"]').is_visible(): async_tasks_primary.append(
page.locator('[data-testid="content-gate"] button').click() self.screenshot(
reddit_main,
page.goto(f'https://reddit.com{comment["comment_url"]}', timeout=0) f"id=t3_{self.reddit_object['thread_id']}",
{"path": "assets/temp/png/title.png"},
# translate code
if settings.config["reddit"]["thread"]["post_lang"]:
comment_tl = ts.google(
comment["comment_body"],
to_language=settings.config["reddit"]["thread"]["post_lang"],
) )
page.evaluate(
'([tl_content, tl_id]) => document.querySelector(`#t1_${tl_id} > div:nth-child(2) > div > div[data-testid="comment"] > div`).textContent = tl_content',
[comment_tl, comment["comment_id"]],
) )
try:
page.locator(f"#t1_{comment['comment_id']}").screenshot( for idx, chunked_tasks in enumerate(
path=f"assets/temp/png/comment_{idx}.png" [chunk for chunk in chunks(async_tasks_primary, 10)],
) start=1,
except TimeoutError: ):
del reddit_object["comments"] chunk_list = async_tasks_primary.__len__() // 10 + (1 if async_tasks_primary.__len__() % 10 != 0 else 0)
screenshot_num += 1 for task in track(
print("TimeoutError: Skipping screenshot...") as_completed(chunked_tasks),
continue description=f"Downloading comments: Chunk {idx}/{chunk_list}",
print_substep("Screenshots downloaded Successfully.", style="bold green") total=chunked_tasks.__len__(),
):
await task
print_substep("Comments downloaded Successfully.", style="bold green")
await self.close_browser()

@ -7,18 +7,16 @@ from pyppeteer.element_handle import ElementHandle as ElementHandleCls
from pyppeteer.errors import TimeoutError as BrowserTimeoutError from pyppeteer.errors import TimeoutError as BrowserTimeoutError
from pathlib import Path from pathlib import Path
from typing import Dict
from utils import settings from utils import settings
from utils.console import print_step, print_substep
from rich.progress import track from rich.progress import track
import translators as ts import translators as ts
from utils.console import print_step, print_substep
from attr import attrs, attrib from attr import attrs, attrib
from attr.validators import instance_of, optional from attr.validators import instance_of
from typing import Optional from typing import Optional
from webdriver.common import ExceptionDecorator from webdriver.common import ExceptionDecorator, chunks
catch_exception = ExceptionDecorator(default_exception=BrowserTimeoutError).catch_exception catch_exception = ExceptionDecorator(default_exception=BrowserTimeoutError).catch_exception
@ -100,8 +98,9 @@ class Wait:
self, self,
page_instance: Optional[PageCls] = None, page_instance: Optional[PageCls] = None,
xpath: Optional[str] = None, xpath: Optional[str] = None,
find_options: Optional[dict] = None,
options: Optional[dict] = None, options: Optional[dict] = None,
*,
find_options: Optional[dict] = None,
el: Optional[ElementHandleCls] = None, el: Optional[ElementHandleCls] = None,
) -> None: ) -> None:
""" """
@ -127,6 +126,7 @@ class Wait:
page_instance: Optional[PageCls] = None, page_instance: Optional[PageCls] = None,
xpath: Optional[str] = None, xpath: Optional[str] = None,
options: Optional[dict] = None, options: Optional[dict] = None,
*,
find_options: Optional[dict] = None, find_options: Optional[dict] = None,
el: Optional[ElementHandleCls] = None, el: Optional[ElementHandleCls] = None,
) -> None: ) -> None:
@ -154,14 +154,21 @@ class RedditScreenshot(Browser, Wait):
Args: Args:
reddit_object (Dict): Reddit object received from reddit/subreddit.py reddit_object (Dict): Reddit object received from reddit/subreddit.py
screenshot_idx (int): List with indexes of voiced comments screenshot_idx (int): List with indexes of voiced comments
story_mode (bool): If submission is a story takes screenshot of the story
""" """
reddit_object: dict reddit_object: dict
screenshot_idx: list screenshot_idx: list
story_mode: Optional[bool] = attrib( story_mode: Optional[bool] = attrib(
validator=instance_of(bool), validator=instance_of(bool),
default=False, default=False,
kw_only=True
) )
def __attrs_post_init__(
        self,
):
    """
    Caches the configured post language on the instance

    The value is later passed as ``to_language`` to the translator, and a
    falsy value disables translation, so it is annotated as an optional string
    rather than a bool.
    """
    self.post_lang: Optional[str] = settings.config["reddit"]["thread"]["post_lang"]
async def __dark_theme( async def __dark_theme(
self, self,
page_instance: PageCls, page_instance: PageCls,
@ -176,33 +183,40 @@ class RedditScreenshot(Browser, Wait):
await self.click( await self.click(
page_instance, page_instance,
"//*[contains(@class, 'header-user-dropdown')]", "//*[contains(@class, 'header-user-dropdown')]",
{"timeout": 5000}, find_options={"timeout": 5000},
) )
# It's normal not to find it, sometimes there is none :shrug: # It's normal not to find it, sometimes there is none :shrug:
await self.click( await self.click(
page_instance, page_instance,
"//*[contains(text(), 'Settings')]/ancestor::button[1]", "//*[contains(text(), 'Settings')]/ancestor::button[1]",
{"timeout": 5000}, find_options={"timeout": 5000},
) )
await self.click( await self.click(
page_instance, page_instance,
"//*[contains(text(), 'Dark Mode')]/ancestor::button[1]", "//*[contains(text(), 'Dark Mode')]/ancestor::button[1]",
{"timeout": 5000}, find_options={"timeout": 5000},
) )
# Closes settings # Closes settings
await self.click( await self.click(
page_instance, page_instance,
"//*[contains(@class, 'header-user-dropdown')]", "//*[contains(@class, 'header-user-dropdown')]",
{"timeout": 5000}, find_options={"timeout": 5000},
) )
async def __close_nsfw( async def __close_nsfw(
self, self,
page_instance: PageCls page_instance: PageCls,
) -> None: ) -> None:
"""
Closes NSFW stuff
Args:
page_instance: Instance of main page
"""
from asyncio import ensure_future from asyncio import ensure_future
print_substep("Post is NSFW. You are spicy...") print_substep("Post is NSFW. You are spicy...")
@ -213,17 +227,17 @@ class RedditScreenshot(Browser, Wait):
await self.click( await self.click(
page_instance, page_instance,
'//button[text()="Yes"]', '//button[text()="Yes"]',
{"timeout": 5000}, find_options={"timeout": 5000},
) )
# Await reload # Await reload
await navigation await navigation
await (await self.find_xpath( await self.click(
page_instance, page_instance,
'//button[text()="Click to see nsfw"]', '//button[text()="Click to see nsfw"]',
{"timeout": 5000}, find_options={"timeout": 5000},
)).click() )
async def __collect_comment( async def __collect_comment(
self, self,
@ -241,19 +255,19 @@ class RedditScreenshot(Browser, Wait):
await comment_page.goto(f'https://reddit.com{comment_obj["comment_url"]}') await comment_page.goto(f'https://reddit.com{comment_obj["comment_url"]}')
# Translates submission' comment # Translates submission' comment
if settings.config["reddit"]["thread"]["post_lang"]: if self.post_lang:
comment_tl = ts.google( comment_tl = ts.google(
comment_obj["comment_body"], comment_obj["comment_body"],
to_language=settings.config["reddit"]["thread"]["post_lang"], to_language=self.post_lang,
) )
await comment_page.evaluate( await comment_page.evaluate(
f'([tl_content, tl_id]) => document.querySelector(`#t1_{comment_obj["comment_id"]} > div:nth-child(2) ' f"([tl_content, tl_id]) => document.querySelector('#t1_{comment_obj['comment_id']} > div:nth-child(2) "
f'> div > div[data-testid="comment"] > div`).textContent = {comment_tl}', f'> div > div[data-testid="comment"] > div\').textContent = {comment_tl}',
) )
await self.screenshot( await self.screenshot(
comment_page, comment_page,
f'//*[contains(@id, \'t1_{comment_obj["comment_id"]}\')]', f"//*[contains(@id, 't1_{comment_obj['comment_id']}')]",
{"path": f"assets/temp/png/comment_{filename_idx}.png"}, {"path": f"assets/temp/png/comment_{filename_idx}.png"},
) )
@ -261,13 +275,12 @@ class RedditScreenshot(Browser, Wait):
async def __collect_story( async def __collect_story(
self, self,
main_page: PageCls, main_page: PageCls,
): ):
# Translates submission text # Translates submission text
if settings.config["reddit"]["thread"]["post_lang"]: if self.post_lang:
story_tl = ts.google( story_tl = ts.google(
self.reddit_object["thread_post"], self.reddit_object["thread_post"],
to_language=settings.config["reddit"]["thread"]["post_lang"], to_language=self.post_lang,
) )
split_story_tl = story_tl.split('\n') split_story_tl = story_tl.split('\n')
await main_page.evaluate( await main_page.evaluate(
@ -304,6 +317,7 @@ class RedditScreenshot(Browser, Wait):
# Get the thread screenshot # Get the thread screenshot
reddit_main = await self.browser.newPage() reddit_main = await self.browser.newPage()
# noinspection Duplicates
await reddit_main.goto(self.reddit_object["thread_url"]) await reddit_main.goto(self.reddit_object["thread_url"])
if settings.config["settings"]["theme"] == "dark": if settings.config["settings"]["theme"] == "dark":
@ -322,13 +336,14 @@ class RedditScreenshot(Browser, Wait):
) )
await reddit_main.evaluate( await reddit_main.evaluate(
"tl_content => document.querySelector('[data-test-id=\"post-content\"] > div:nth-child(3) > div > " "document.querySelector('[data-test-id=\"post-content\"] > div:nth-child(3) > div > "
"div').textContent = tl_content", f"div').textContent = {texts_in_tl}",
texts_in_tl,
) )
else: else:
print_substep("Skipping translation...") print_substep("Skipping translation...")
# No sense to move it in common.py
# noinspection Duplicates
async_tasks_primary = ( async_tasks_primary = (
[ [
self.__collect_comment(self.reddit_object["comments"][idx], idx) for idx in self.__collect_comment(self.reddit_object["comments"][idx], idx) for idx in
@ -348,13 +363,6 @@ class RedditScreenshot(Browser, Wait):
) )
) )
# Lots of tabs - lots of memory
# chunk needed to minimize memory required
def chunks(lst, n):
"""Yield successive n-sized chunks from list."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
for idx, chunked_tasks in enumerate( for idx, chunked_tasks in enumerate(
[chunk for chunk in chunks(async_tasks_primary, 10)], [chunk for chunk in chunks(async_tasks_primary, 10)],
start=1, start=1,

Loading…
Cancel
Save