From 3a341157017f82a9eacf262d65f00b7ccefab60c Mon Sep 17 00:00:00 2001
From: Lee Stott
Date: Tue, 17 Feb 2026 07:54:09 +0000
Subject: [PATCH] Update 1-Introduction/01-defining-data-science/notebook.ipynb

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../01-defining-data-science/notebook.ipynb | 39 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/1-Introduction/01-defining-data-science/notebook.ipynb b/1-Introduction/01-defining-data-science/notebook.ipynb
index 4648caf0..e571610f 100644
--- a/1-Introduction/01-defining-data-science/notebook.ipynb
+++ b/1-Introduction/01-defining-data-science/notebook.ipynb
@@ -91,7 +91,44 @@
    "cell_type": "code",
    "execution_count": 64,
    "source": [
-    "from bs4 import BeautifulSoup\r\n\r\n# Parse the HTML content\r\nsoup = BeautifulSoup(text, 'html.parser')\r\n\r\n# Extract only the main article content from Wikipedia\r\n# Wikipedia uses 'mw-parser-output' class for the main article content\r\ncontent = soup.find('div', class_='mw-parser-output')\r\n\r\nif content:\r\n    # Get text from the content, excluding navigation, references, etc.\r\n    text = content.get_text(separator=' ', strip=True)\r\n    print(text[:1000])\r\nelse:\r\n    print(\"Could not find main content. Using full page text.\")\r\n    text = soup.get_text(separator=' ', strip=True)\r\n    print(text[:1000])"
+    "from bs4 import BeautifulSoup\r\n",
+    "\r\n",
+    "# Parse the HTML content\r\n",
+    "soup = BeautifulSoup(text, 'html.parser')\r\n",
+    "\r\n",
+    "# Extract only the main article content from Wikipedia\r\n",
+    "# Wikipedia uses 'mw-parser-output' class for the main article content\r\n",
+    "content = soup.find('div', class_='mw-parser-output')\r\n",
+    "\r\n",
+    "def _clean_wikipedia_content(content_node):\r\n",
+    "    \"\"\"Remove common non-article elements from a Wikipedia content node.\"\"\"\r\n",
+    "    # Strip jump links, navboxes, reference lists/superscripts, edit sections, TOC, sidebars, etc.\r\n",
+    "    selectors = [\r\n",
+    "        '.mw-jump-link',\r\n",
+    "        '.navbox',\r\n",
+    "        '.reflist',\r\n",
+    "        'sup.reference',\r\n",
+    "        '.mw-editsection',\r\n",
+    "        '.hatnote',\r\n",
+    "        '.metadata',\r\n",
+    "        '.infobox',\r\n",
+    "        '#toc',\r\n",
+    "        '.toc',\r\n",
+    "        '.sidebar',\r\n",
+    "    ]\r\n",
+    "    for selector in selectors:\r\n",
+    "        for el in content_node.select(selector):\r\n",
+    "            el.decompose()\r\n",
+    "\r\n",
+    "if content:\r\n",
+    "    # Clean the content node to better approximate article text only.\r\n",
+    "    _clean_wikipedia_content(content)\r\n",
+    "    text = content.get_text(separator=' ', strip=True)\r\n",
+    "    print(text[:1000])\r\n",
+    "else:\r\n",
+    "    print(\"Could not find main content. Using full page text.\")\r\n",
+    "    text = soup.get_text(separator=' ', strip=True)\r\n",
+    "    print(text[:1000])"
    ],
    "outputs": [
     {