From d8ec0fe1e5ebac21afe68c0563cc54ba3253e751 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Feb 2026 07:58:29 +0000 Subject: [PATCH] Remove underscore prefix from function name for better notebook style Co-authored-by: leestott <2511341+leestott@users.noreply.github.com> --- .../01-defining-data-science/notebook.ipynb | 39 +------------------ .../solution/notebook.ipynb | 2 +- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/1-Introduction/01-defining-data-science/notebook.ipynb b/1-Introduction/01-defining-data-science/notebook.ipynb index 7b564420..35a1af9b 100644 --- a/1-Introduction/01-defining-data-science/notebook.ipynb +++ b/1-Introduction/01-defining-data-science/notebook.ipynb @@ -91,44 +91,7 @@ "cell_type": "code", "execution_count": 64, "source": [ - "from bs4 import BeautifulSoup\r\n", - "\r\n", - "# Parse the HTML content\r\n", - "soup = BeautifulSoup(text, 'html.parser')\r\n", - "\r\n", - "# Extract only the main article content from Wikipedia\r\n", - "# Wikipedia uses 'mw-parser-output' class for the main article content\r\n", - "content = soup.find('div', class_='mw-parser-output')\r\n", - "\r\n", - "def _clean_wikipedia_content(content_node):\r\n", - " \"\"\"Remove common non-article elements from a Wikipedia content node.\"\"\"\r\n", - " # Strip jump links, navboxes, reference lists/superscripts, edit sections, TOC, sidebars, etc.\r\n", - " selectors = [\r\n", - " '.mw-jump-link',\r\n", - " '.navbox',\r\n", - " '.reflist',\r\n", - " 'sup.reference',\r\n", - " '.mw-editsection',\r\n", - " '.hatnote',\r\n", - " '.metadata',\r\n", - " '.infobox',\r\n", - " '#toc',\r\n", - " '.toc',\r\n", - " '.sidebar',\r\n", - " ]\r\n", - " for selector in selectors:\r\n", - " for el in content_node.select(selector):\r\n", - " el.decompose()\r\n", - "\r\n", - "if content:\r\n", - " # Clean the content node to better approximate article text only.\r\n", - " _clean_wikipedia_content(content)\r\n", - " text = content.get_text(separator=' ', strip=True)\r\n", - " print(text[:1000])\r\n", - "else:\r\n", - " print(\"Could not find main content. Using full page text.\")\r\n", - " text = soup.get_text(separator=' ', strip=True)\r\n", - " print(text[:1000])" + "from bs4 import BeautifulSoup\r\n\r\n# Parse the HTML content\r\nsoup = BeautifulSoup(text, 'html.parser')\r\n\r\n# Extract only the main article content from Wikipedia\r\n# Wikipedia uses 'mw-parser-output' class for the main article content\r\ncontent = soup.find('div', class_='mw-parser-output')\r\n\r\ndef clean_wikipedia_content(content_node):\r\n \"\"\"Remove common non-article elements from a Wikipedia content node.\"\"\"\r\n # Strip jump links, navboxes, reference lists/superscripts, edit sections, TOC, sidebars, etc.\r\n selectors = [\r\n '.mw-jump-link',\r\n '.navbox',\r\n '.reflist',\r\n 'sup.reference',\r\n '.mw-editsection',\r\n '.hatnote',\r\n '.metadata',\r\n '.infobox',\r\n '#toc',\r\n '.toc',\r\n '.sidebar',\r\n ]\r\n for selector in selectors:\r\n for el in content_node.select(selector):\r\n el.decompose()\r\n\r\nif content:\r\n # Clean the content node to better approximate article text only.\r\n clean_wikipedia_content(content)\r\n text = content.get_text(separator=' ', strip=True)\r\n print(text[:1000])\r\nelse:\r\n print(\"Could not find main content. Using full page text.\")\r\n text = soup.get_text(separator=' ', strip=True)\r\n print(text[:1000])" ], "outputs": [ { diff --git a/1-Introduction/01-defining-data-science/solution/notebook.ipynb b/1-Introduction/01-defining-data-science/solution/notebook.ipynb index 92e616b5..75e45a91 100644 --- a/1-Introduction/01-defining-data-science/solution/notebook.ipynb +++ b/1-Introduction/01-defining-data-science/solution/notebook.ipynb @@ -94,7 +94,7 @@ "cell_type": "code", "execution_count": 4, "source": [ - "from bs4 import BeautifulSoup\r\n\r\n# Parse the HTML content\r\nsoup = BeautifulSoup(text, 'html.parser')\r\n\r\n# Extract only the main article content from Wikipedia\r\n# Wikipedia uses 'mw-parser-output' class for the main article content\r\ncontent = soup.find('div', class_='mw-parser-output')\r\n\r\ndef _clean_wikipedia_content(content_node):\r\n \"\"\"Remove common non-article elements from a Wikipedia content node.\"\"\"\r\n # Strip jump links, navboxes, reference lists/superscripts, edit sections, TOC, sidebars, etc.\r\n selectors = [\r\n '.mw-jump-link',\r\n '.navbox',\r\n '.reflist',\r\n 'sup.reference',\r\n '.mw-editsection',\r\n '.hatnote',\r\n '.metadata',\r\n '.infobox',\r\n '#toc',\r\n '.toc',\r\n '.sidebar',\r\n ]\r\n for selector in selectors:\r\n for el in content_node.select(selector):\r\n el.decompose()\r\n\r\nif content:\r\n # Clean the content node to better approximate article text only.\r\n _clean_wikipedia_content(content)\r\n text = content.get_text(separator=' ', strip=True)\r\n print(text[:1000])\r\nelse:\r\n print(\"Could not find main content. Using full page text.\")\r\n text = soup.get_text(separator=' ', strip=True)\r\n print(text[:1000])" + "from bs4 import BeautifulSoup\r\n\r\n# Parse the HTML content\r\nsoup = BeautifulSoup(text, 'html.parser')\r\n\r\n# Extract only the main article content from Wikipedia\r\n# Wikipedia uses 'mw-parser-output' class for the main article content\r\ncontent = soup.find('div', class_='mw-parser-output')\r\n\r\ndef clean_wikipedia_content(content_node):\r\n \"\"\"Remove common non-article elements from a Wikipedia content node.\"\"\"\r\n # Strip jump links, navboxes, reference lists/superscripts, edit sections, TOC, sidebars, etc.\r\n selectors = [\r\n '.mw-jump-link',\r\n '.navbox',\r\n '.reflist',\r\n 'sup.reference',\r\n '.mw-editsection',\r\n '.hatnote',\r\n '.metadata',\r\n '.infobox',\r\n '#toc',\r\n '.toc',\r\n '.sidebar',\r\n ]\r\n for selector in selectors:\r\n for el in content_node.select(selector):\r\n el.decompose()\r\n\r\nif content:\r\n # Clean the content node to better approximate article text only.\r\n clean_wikipedia_content(content)\r\n text = content.get_text(separator=' ', strip=True)\r\n print(text[:1000])\r\nelse:\r\n print(\"Could not find main content. Using full page text.\")\r\n text = soup.get_text(separator=' ', strip=True)\r\n print(text[:1000])" ], "outputs": [ {