From d58a6880f3e5beb16878d44a672399859d9b9406 Mon Sep 17 00:00:00 2001 From: Hugo Slepicka Date: Wed, 9 Mar 2022 18:03:51 -0800 Subject: [PATCH 1/3] FIX: Cleanup capabilities and options upon stop_browser. Fixes #40. --- botcity/web/bot.py | 5 +++++ mkdocs.yml | 2 ++ 2 files changed, 7 insertions(+) diff --git a/botcity/web/bot.py b/botcity/web/bot.py index 3564453..7d1fc97 100644 --- a/botcity/web/bot.py +++ b/botcity/web/bot.py @@ -233,11 +233,16 @@ def check_driver(): def stop_browser(self): """ Stops the Chrome browser and clean up the User Data Directory. + + Warning: + After invoking this method, you will need to reassign your custom options and capabilities. """ if not self._driver: return self._driver.close() self._driver.quit() + self.options = None + self.capabilities = None self._driver = None def set_screen_resolution(self, width=None, height=None): diff --git a/mkdocs.yml b/mkdocs.yml index a68dc92..fb8bcf0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -32,6 +32,8 @@ theme: name: Switch to light mode markdown_extensions: + - admonition + - pymdownx.details - pymdownx.highlight - pymdownx.superfences From 5af2de34ebc32d3c55b77f16e07c44dae1f487fd Mon Sep 17 00:00:00 2001 From: Hugo Slepicka Date: Mon, 14 Mar 2022 11:00:07 -0700 Subject: [PATCH 2/3] Enhancements - Add `wait_for_new_page`, a context manager to help with operations that open a new tab or window. - `find_element` and `find_elements` now have `waiting_time` and `ensure_visible` parameters for enhanced experience and ease of use making it similar to the `find` methods for computer-vision. - Add `wait_for_stale_element` which waits until an element is modified on the DOM. - Add `wait_for_element_visibility` which waits until an element becomes visible or invisible. - Add new `parsers` module with `table_to_dict` which allow users to extract structured data from HTML tables into a list of dictionaries. - Add `element_as_select` to ease the handling of select elements on forms. 
Documentation - New section `Handling Data` showcasing how you can easily extract data from web pages. - New section `Interacting with Forms` showcasing how to deal with select elements and file input elements. --- botcity/web/__init__.py | 2 + botcity/web/bot.py | 95 +++++++++++++++++++++++++++++++++++++---- botcity/web/parsers.py | 89 ++++++++++++++++++++++++++++++++++++++ botcity/web/util.py | 24 ++++++++++- docs/forms.md | 44 +++++++++++++++++++ docs/intro.md | 34 +++++++++------ docs/parsers.md | 67 +++++++++++++++++++++++++++++ mkdocs.yml | 2 + 8 files changed, 336 insertions(+), 21 deletions(-) create mode 100644 botcity/web/parsers.py create mode 100644 docs/forms.md create mode 100644 docs/parsers.md diff --git a/botcity/web/__init__.py b/botcity/web/__init__.py index bacac8a..aefc889 100644 --- a/botcity/web/__init__.py +++ b/botcity/web/__init__.py @@ -1,4 +1,6 @@ from .bot import WebBot, Browser, BROWSER_CONFIGS, By # noqa: F401, F403 +from .parsers import table_to_dict, data_from_row, sanitize_header # noqa: F401, F403 +from .util import element_as_select # noqa: F401, F403 from botcity.web._version import get_versions __version__ = get_versions()['version'] diff --git a/botcity/web/bot.py b/botcity/web/bot.py index 7d1fc97..d38f956 100644 --- a/botcity/web/bot.py +++ b/botcity/web/bot.py @@ -10,6 +10,7 @@ import shutil import time from typing import List +from contextlib import contextmanager from botcity.base import BaseBot, State from botcity.base.utils import only_if_element @@ -21,7 +22,8 @@ from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.remote.webelement import WebElement -from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support.wait import WebDriverWait, TimeoutException, NoSuchElementException +from selenium.webdriver.support import expected_conditions as EC from . 
import config, cv2find from .browsers import BROWSER_CONFIGS, Browser @@ -859,6 +861,26 @@ def browse(self, url): """ self.navigate_to(url) + @contextmanager + def wait_for_new_page(self, waiting_time=10000, activate=True): + """Context manager to wait for a new page to load and activate it. + + Args: + waiting_time (int, optional): The maximum waiting time. Defaults to 10000. + activate (bool, optional): Whether or not to activate the new page. Defaults to True. + + """ + tabs = self.get_tabs() + yield + start_time = time.time() + while tabs == self.get_tabs(): + elapsed_time = (time.time() - start_time) * 1000 + if elapsed_time > waiting_time: + return None + time.sleep(0.1) + if activate: + self.activate_tab(self.get_tabs()[-1]) + def execute_javascript(self, code): """ Execute the given javascript code. @@ -1037,15 +1059,18 @@ def wait_for_downloads(self, timeout: int = 120000): wait_method = BROWSER_CONFIGS.get(self.browser).get("wait_for_downloads") # waits for all the files to be completed - WebDriverWait(self._driver, timeout/1000, 1).until(wait_method) + WebDriverWait(self._driver, timeout/1000.0, 1).until(wait_method) - def find_elements(self, selector: str, by: By = By.CSS_SELECTOR) -> List[WebElement]: + def find_elements(self, selector: str, by: By = By.CSS_SELECTOR, waiting_time=10000, ensure_visible: bool = True) -> List[WebElement]: """Find elements using the specified selector with selector type specified by `by`. Args: selector (str): The selector string to be used. by (str, optional): Selector type. Defaults to By.CSS_SELECTOR. [See more](https://selenium-python.readthedocs.io/api.html#selenium.webdriver.common.by.By) + waiting_time (int, optional): Maximum wait time (ms) to search for a hit. + Defaults to 10000ms (10s). + ensure_visible (bool, optional): Whether to wait for the element to be visible. Defaults to True. Returns: List[WebElement]: List of elements found. 
@@ -1059,9 +1084,20 @@ def find_elements(self, selector: str, by: By = By.CSS_SELECTOR) -> List[WebElem ... ``` """ - return self._driver.find_elements(by, selector) + condition = EC.visibility_of_all_elements_located if ensure_visible else EC.presence_of_all_elements_located + + try: + elements = WebDriverWait( + self._driver, timeout=waiting_time / 1000.0 + ).until( + condition((by, selector)) + ) + return elements + except (TimeoutException, NoSuchElementException) as ex: + print("Exception on find_elements", ex) + return None - def find_element(self, selector: str, by: str = By.CSS_SELECTOR) -> WebElement: + def find_element(self, selector: str, by: str = By.CSS_SELECTOR, waiting_time=10000, ensure_visible: bool = False, ensure_clickable: bool = False) -> WebElement: """Find an element using the specified selector with selector type specified by `by`. If more than one element is found, the first instance is returned. @@ -1069,6 +1105,11 @@ def find_element(self, selector: str, by: str = By.CSS_SELECTOR) -> WebElement: selector (str): The selector string to be used. by (str, optional): Selector type. Defaults to By.CSS_SELECTOR. [See more](https://selenium-python.readthedocs.io/api.html#selenium.webdriver.common.by.By) + waiting_time (int, optional): Maximum wait time (ms) to search for a hit. + Defaults to 10000ms (10s). + ensure_visible (bool, optional): Whether to wait for the element to be visible. Defaults to False. + ensure_clickable (bool, optional): Whether to wait for the element to be clickable. Defaults to False. + If True, `ensure_clickable` takes precedence over `ensure_visible`. Returns: WebElement: The element found. @@ -1084,9 +1125,47 @@ def find_element(self, selector: str, by: str = By.CSS_SELECTOR) -> WebElement: ... 
``` """ - out = self.find_elements(selector=selector, by=by) - if out: - return out[0] + condition = EC.visibility_of_element_located if ensure_visible else EC.presence_of_element_located + condition = EC.element_to_be_clickable if ensure_clickable else condition + + try: + element = WebDriverWait( + self._driver, timeout=waiting_time/1000.0 + ).until( + condition((by, selector)) + ) + return element + except (TimeoutException, NoSuchElementException): + return None + + def wait_for_stale_element(self, element: WebElement, timeout: int = 10000): + """ + Wait until the WebElement element becomes stale (outdated). + + Args: + element (WebElement): The element to monitor for staleness. + timeout (int, optional): Timeout in millis. Defaults to 120000. + """ + try: + WebDriverWait(self._driver, timeout=timeout/1000.0).until(EC.staleness_of(element)) + except (TimeoutException, NoSuchElementException): + pass + + def wait_for_element_visibility(self, element: WebElement, visible: bool = True, waiting_time=10000): + """Wait for the element to be visible or hidden. + + Args: + element (WebElement): The element to wait for. + visible (bool, optional): Whether to wait for the element to be visible. Defaults to True. + waiting_time (int, optional): Maximum wait time (ms) to search for a hit. + Defaults to 10000ms (10s). + """ + if visible: + wait_method = EC.visibility_of + else: + wait_method = EC.invisibility_of_element + + WebDriverWait(self._driver, timeout=waiting_time/1000.0).until(wait_method(element)) def set_file_input_element(self, element: WebElement, filepath: str): """Configure the filepath for upload in a file element. 
diff --git a/botcity/web/parsers.py b/botcity/web/parsers.py new file mode 100644 index 0000000..e70a2c8 --- /dev/null +++ b/botcity/web/parsers.py @@ -0,0 +1,89 @@ +import collections +import string +from typing import Dict, List +from selenium.webdriver.remote.webelement import WebElement + + +def data_from_row(row: WebElement, cell_tag="td") -> List[str]: + """Extract data from a row and return it as a list. + + Args: + row (WebElement): The row element. + cell_tag (str, optional): The HTML tag associated with the row cells. Defaults to "td". + + Returns: + list: List of strings with the contents. + """ + return [ + col.text for col in row.find_elements_by_tag_name(cell_tag) + ] + + +def sanitize_header(labels: List[str]): + """Sanitize header labels.""" + # Handle Treat Empty Header + for idx, label in enumerate(labels): + if label.strip(): + # make it lowercase + label = label.lower() + + # remove punctuations + label = ''.join([l for l in label if l not in string.punctuation]) + + # replace spaces with underscores + label = label.replace(" ", "_") + else: + label = f"col_{idx}" + labels[idx] = label + + # Deduplicate by adding _1, _2, _3 to repeated labels + counts = {k: v for k, v in collections.Counter(labels).items() if v > 1} + for i in reversed(range(len(labels))): + item = labels[i] + if item in counts and counts[item]: + labels[i] = f"{item}_{counts[item]}" + counts[item] -= 1 + + return labels + + +def table_to_dict(table: WebElement, has_header: bool = True, skip_rows: int = 0, header_tag: str = "th") -> List[Dict]: + """Convert a table WebElement to a dict of lists. + + Args: + table (WebElement): The table element. + has_header (bool, optional): Whether or not to parse a header. Defaults to True. + skip_rows (int, optional): Number of rows to skip from the top. Defaults to 0. + header_tag (str, optional): The HTML tag associated with the header cell. Defaults to "th". + + Returns: + list: List with dict for each row. 
+ """ + + # Collect all rows from table + rows = table.find_elements_by_tag_name("tr") + + # Skip rows if informed + if skip_rows: + rows = rows[skip_rows:] + + # Parse header labels + if has_header: + # Read header labels + labels = data_from_row(rows[0], cell_tag=header_tag) + # Sanitize headers + labels = sanitize_header(labels) + # Skip the header + rows = rows[1:] + else: + # Make up header labels + num_cols = len(rows[0].find_elements_by_tag_name("td")) + labels = [f"col_{i}" for i in range(num_cols)] + + # Assemble output dictionary + out_list = [] + for row in rows: + row_data = data_from_row(row) + out_list.append(dict(zip(labels, row_data))) + + return out_list diff --git a/botcity/web/util.py b/botcity/web/util.py index 96041d0..5eb468b 100644 --- a/botcity/web/util.py +++ b/botcity/web/util.py @@ -1,9 +1,31 @@ import shutil +import tempfile +from selenium.webdriver.remote.webelement import WebElement +from selenium.webdriver.support.select import Select -def cleanup_temp_dir(temp_dir): + +def cleanup_temp_dir(temp_dir: tempfile.TemporaryDirectory) -> None: + """ + Deletes the temporary directory and all its contents. + + Args: + temp_dir (tempfile.TemporaryDirectory): The temporary directory to delete. + """ if temp_dir: try: temp_dir.cleanup() except OSError: shutil.rmtree(temp_dir.name, ignore_errors=True) + + +def element_as_select(element: WebElement) -> Select: + """Wraps a WebElement in a Select object. + + Args: + element (WebElement): The element to wrap. + + Returns: + Select: The Select object. + """ + return Select(element) diff --git a/docs/forms.md b/docs/forms.md new file mode 100644 index 0000000..d3d5bec --- /dev/null +++ b/docs/forms.md @@ -0,0 +1,44 @@ +# Interacting with Forms + +When dealing with forms, we often need to fill in the form and submit it. + +While most of the operations are trivial, there are some things that are not such as selecting a select element or dealing with file uploads. 
+ +For that we developed some utility functions that you can use. + +## Select Element + +After grabbing the element via the `find_element` or `find_elements` functions, we can use the `element_as_select` to convert it into a `Select` object. + +::: botcity.web.util.element_as_select + +### Example usage + +```python +# Import the function +from botcity.web.util import element_as_select +... +# Fetch the select element +element = self.find_element("select", By.TAG_NAME) +# Convert the element into a Select object +select_element = element_as_select(element) +# Select the option based on visible text +select_element.select_by_visible_text("Option 1") +... +``` + +## File Upload + +After grabbing the element via the `find_element` or `find_elements` functions, we can use the `set_file_input_element` to assign the file path to the element. + +### Example usage + +```python +from botcity.web import By +... +# Find the input element of type `file` using CSS_SELECTOR. +elem = self.find_element("body > form > input[type=file]", By.CSS_SELECTOR) +# Configure the file to be used when processing the upload +self.set_file_input_element(elem, "./test.txt") +... +``` diff --git a/docs/intro.md b/docs/intro.md index 40492c3..1c9731d 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -63,14 +63,14 @@ class Bot(WebBot): # Changes the Browser to Firefox self.browser = Browser.FIREFOX - + # For Chrome # self.browser = Browser.CHROME - + ... 
``` @@ -101,20 +101,20 @@ By default the browsers are launched with a set of curated options which we pick Before getting into how to customize those details let's walk through some of the assumptions and details which are covered by the `default options`. -- **Headless Execution**: Depending on the `headless` property set on your Bot class we pick the +- **Headless Execution**: Depending on the `headless` property set on your Bot class we pick the proper configuration to launch the browser in the desired mode. - + - **Downloads Folder Path**: By default we save all downloaded files on the Desktop folder. - **User Profile**: By default we generate a temporary directory (which is later erased) to be used as the profile directory. This procedure ensure that every execution starts with a clean browser session and things such as cookies and stored passwords or certificates from one execution won't interfere with the others. - -A handful of other options are also set and they can be inspected on the source code for each browser on the + +A handful of other options are also set and they can be inspected on the source code for each browser on the `botcity.web.browsers` module. -If you really need to customize the options you can do so via the `options` property. You can fetch +If you really need to customize the options you can do so via the `options` property. You can fetch the `default options` curated by BotCity and make your changes or start your options from scratch. In the following snippet we will cover how to build on top of the existing options. @@ -140,13 +140,13 @@ class Bot(WebBot): download_folder_path=self.download_folder_path, user_data_dir=None # Informing None here will generate a temporary directory ) - + # Add your customized argument def_options.add_argument("") - + # Update the options to use the customized Options. self.options = def_options - + ... 
``` @@ -156,6 +156,16 @@ Every supported browser will have an exclusive module with curated default optio This function takes in arguments to define the mode of execution (headless or not), default download folder path and user data/profile directory. +### Parsing Data + +When extracting data from websites we often need to read information from tables and other common sources. +The `Handling Data` section highlights how we can do that and which functions are available to be used. + +### Forms + +It is common while navigating websites to have to fill out forms and submit them. +To facilitate this we have an `Interacting with Forms` section which highlights the available functions to be used. + ### Next Steps Check our examples and experiment with the API. diff --git a/docs/parsers.md b/docs/parsers.md new file mode 100644 index 0000000..df6084c --- /dev/null +++ b/docs/parsers.md @@ -0,0 +1,67 @@ +# Handling Data + +## Tables + +To extract data from tables we offer a utility function which parses the table and returns a list of dictionaries. + +::: botcity.web.parsers.table_to_dict + +### Usage Example + +Let's fetch data from the example table available at the [W3Schools website](https://www.w3schools.com/html/html_tables.asp). 
+ + +```python +class Bot(WebBot): + def action(self, execution=None): + self.browse("https://www.w3schools.com/html/html_tables.asp") + + # Fetch the table + table = self.find_element("table", By.TAG_NAME) + + # Parse the table + parsed_table = table_to_dict(table) + + # Print the parsed table + print(parsed_table) + + # Close the browser and free resources + self.stop_browser() +``` + +The output should look like this: + +```python +[ + { + 'company': 'Alfreds Futterkiste', + 'contact': 'Maria Anders', + 'country': 'Germany' + }, + { + 'company': 'Centro comercial Moctezuma', + 'contact': 'Francisco Chang', + 'country': 'Mexico' + }, + { + 'company': 'Ernst Handel', + 'contact': 'Roland Mendel', + 'country': 'Austria' + }, + { + 'company': 'Island Trading', + 'contact': 'Helen Bennett', + 'country': 'UK' + }, + { + 'company': 'Laughing Bacchus Winecellars', + 'contact': 'Yoshi Tannamuri', + 'country': 'Canada' + }, + { + 'company': 'Magazzini Alimentari Riuniti', + 'contact': 'Giovanni Rovelli', + 'country': 'Italy' + } +] +``` \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index fb8bcf0..30a68a6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -7,6 +7,8 @@ nav: - Home: index.md - Getting Started: intro.md - Framework: bot.md + - Handling Data: parsers.md + - Interacting with Forms: forms.md - Browsers: browsers.md theme: From 5339c5fb5105e93db3998bc53ffdba6a82eda97f Mon Sep 17 00:00:00 2001 From: Hugo Slepicka Date: Mon, 14 Mar 2022 11:06:40 -0700 Subject: [PATCH 3/3] STY: Changes to abide by PEP8. 
--- botcity/web/__init__.py | 2 +- botcity/web/bot.py | 11 ++++++++--- botcity/web/parsers.py | 5 +++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/botcity/web/__init__.py b/botcity/web/__init__.py index aefc889..ca888b1 100644 --- a/botcity/web/__init__.py +++ b/botcity/web/__init__.py @@ -1,6 +1,6 @@ from .bot import WebBot, Browser, BROWSER_CONFIGS, By # noqa: F401, F403 from .parsers import table_to_dict, data_from_row, sanitize_header # noqa: F401, F403 -from .util import element_as_select # noqa: F401, F403 +from .util import element_as_select # noqa: F401, F403 from botcity.web._version import get_versions __version__ = get_versions()['version'] diff --git a/botcity/web/bot.py b/botcity/web/bot.py index d38f956..6b9face 100644 --- a/botcity/web/bot.py +++ b/botcity/web/bot.py @@ -1061,7 +1061,8 @@ def wait_for_downloads(self, timeout: int = 120000): # waits for all the files to be completed WebDriverWait(self._driver, timeout/1000.0, 1).until(wait_method) - def find_elements(self, selector: str, by: By = By.CSS_SELECTOR, waiting_time=10000, ensure_visible: bool = True) -> List[WebElement]: + def find_elements(self, selector: str, by: By = By.CSS_SELECTOR, + waiting_time=10000, ensure_visible: bool = True) -> List[WebElement]: """Find elements using the specified selector with selector type specified by `by`. Args: @@ -1084,7 +1085,10 @@ def find_elements(self, selector: str, by: By = By.CSS_SELECTOR, waiting_time=10 ... 
``` """ - condition = EC.visibility_of_all_elements_located if ensure_visible else EC.presence_of_all_elements_located + if ensure_visible: + condition = EC.visibility_of_all_elements_located + else: + condition = EC.presence_of_all_elements_located try: elements = WebDriverWait( @@ -1097,7 +1101,8 @@ def find_elements(self, selector: str, by: By = By.CSS_SELECTOR, waiting_time=10 print("Exception on find_elements", ex) return None - def find_element(self, selector: str, by: str = By.CSS_SELECTOR, waiting_time=10000, ensure_visible: bool = False, ensure_clickable: bool = False) -> WebElement: + def find_element(self, selector: str, by: str = By.CSS_SELECTOR, waiting_time=10000, + ensure_visible: bool = False, ensure_clickable: bool = False) -> WebElement: """Find an element using the specified selector with selector type specified by `by`. If more than one element is found, the first instance is returned. diff --git a/botcity/web/parsers.py b/botcity/web/parsers.py index e70a2c8..6228cc7 100644 --- a/botcity/web/parsers.py +++ b/botcity/web/parsers.py @@ -28,7 +28,7 @@ def sanitize_header(labels: List[str]): label = label.lower() # remove punctuations - label = ''.join([l for l in label if l not in string.punctuation]) + label = ''.join([l for l in label if l not in string.punctuation]) # noqa: E741 # replace spaces with underscores label = label.replace(" ", "_") @@ -47,7 +47,8 @@ def sanitize_header(labels: List[str]): return labels -def table_to_dict(table: WebElement, has_header: bool = True, skip_rows: int = 0, header_tag: str = "th") -> List[Dict]: +def table_to_dict(table: WebElement, has_header: bool = True, + skip_rows: int = 0, header_tag: str = "th") -> List[Dict]: """Convert a table WebElement to a dict of lists. Args: