Remove underscore prefix from function name for better notebook style

Co-authored-by: leestott <2511341+leestott@users.noreply.github.com>
copilot/fix-relevant-content-extraction
copilot-swe-agent[bot] 1 week ago
parent c8299dabb2
commit d8ec0fe1e5

@ -91,44 +91,7 @@
"cell_type": "code",
"execution_count": 64,
"source": [
"from bs4 import BeautifulSoup\r\n",
"\r\n",
"# Parse the HTML content\r\n",
"soup = BeautifulSoup(text, 'html.parser')\r\n",
"\r\n",
"# Extract only the main article content from Wikipedia\r\n",
"# Wikipedia uses 'mw-parser-output' class for the main article content\r\n",
"content = soup.find('div', class_='mw-parser-output')\r\n",
"\r\n",
"def _clean_wikipedia_content(content_node):\r\n",
" \"\"\"Remove common non-article elements from a Wikipedia content node.\"\"\"\r\n",
" # Strip jump links, navboxes, reference lists/superscripts, edit sections, TOC, sidebars, etc.\r\n",
" selectors = [\r\n",
" '.mw-jump-link',\r\n",
" '.navbox',\r\n",
" '.reflist',\r\n",
" 'sup.reference',\r\n",
" '.mw-editsection',\r\n",
" '.hatnote',\r\n",
" '.metadata',\r\n",
" '.infobox',\r\n",
" '#toc',\r\n",
" '.toc',\r\n",
" '.sidebar',\r\n",
" ]\r\n",
" for selector in selectors:\r\n",
" for el in content_node.select(selector):\r\n",
" el.decompose()\r\n",
"\r\n",
"if content:\r\n",
" # Clean the content node to better approximate article text only.\r\n",
" _clean_wikipedia_content(content)\r\n",
" text = content.get_text(separator=' ', strip=True)\r\n",
" print(text[:1000])\r\n",
"else:\r\n",
" print(\"Could not find main content. Using full page text.\")\r\n",
" text = soup.get_text(separator=' ', strip=True)\r\n",
" print(text[:1000])"
"from bs4 import BeautifulSoup\r\n\r\n# Parse the HTML content\r\nsoup = BeautifulSoup(text, 'html.parser')\r\n\r\n# Extract only the main article content from Wikipedia\r\n# Wikipedia uses 'mw-parser-output' class for the main article content\r\ncontent = soup.find('div', class_='mw-parser-output')\r\n\r\ndef clean_wikipedia_content(content_node):\r\n \"\"\"Remove common non-article elements from a Wikipedia content node.\"\"\"\r\n # Strip jump links, navboxes, reference lists/superscripts, edit sections, TOC, sidebars, etc.\r\n selectors = [\r\n '.mw-jump-link',\r\n '.navbox',\r\n '.reflist',\r\n 'sup.reference',\r\n '.mw-editsection',\r\n '.hatnote',\r\n '.metadata',\r\n '.infobox',\r\n '#toc',\r\n '.toc',\r\n '.sidebar',\r\n ]\r\n for selector in selectors:\r\n for el in content_node.select(selector):\r\n el.decompose()\r\n\r\nif content:\r\n # Clean the content node to better approximate article text only.\r\n clean_wikipedia_content(content)\r\n text = content.get_text(separator=' ', strip=True)\r\n print(text[:1000])\r\nelse:\r\n print(\"Could not find main content. Using full page text.\")\r\n text = soup.get_text(separator=' ', strip=True)\r\n print(text[:1000])"
],
"outputs": [
{

@ -94,7 +94,7 @@
"cell_type": "code",
"execution_count": 4,
"source": [
"from bs4 import BeautifulSoup\r\n\r\n# Parse the HTML content\r\nsoup = BeautifulSoup(text, 'html.parser')\r\n\r\n# Extract only the main article content from Wikipedia\r\n# Wikipedia uses 'mw-parser-output' class for the main article content\r\ncontent = soup.find('div', class_='mw-parser-output')\r\n\r\ndef _clean_wikipedia_content(content_node):\r\n \"\"\"Remove common non-article elements from a Wikipedia content node.\"\"\"\r\n # Strip jump links, navboxes, reference lists/superscripts, edit sections, TOC, sidebars, etc.\r\n selectors = [\r\n '.mw-jump-link',\r\n '.navbox',\r\n '.reflist',\r\n 'sup.reference',\r\n '.mw-editsection',\r\n '.hatnote',\r\n '.metadata',\r\n '.infobox',\r\n '#toc',\r\n '.toc',\r\n '.sidebar',\r\n ]\r\n for selector in selectors:\r\n for el in content_node.select(selector):\r\n el.decompose()\r\n\r\nif content:\r\n # Clean the content node to better approximate article text only.\r\n _clean_wikipedia_content(content)\r\n text = content.get_text(separator=' ', strip=True)\r\n print(text[:1000])\r\nelse:\r\n print(\"Could not find main content. Using full page text.\")\r\n text = soup.get_text(separator=' ', strip=True)\r\n print(text[:1000])"
"from bs4 import BeautifulSoup\r\n\r\n# Parse the HTML content\r\nsoup = BeautifulSoup(text, 'html.parser')\r\n\r\n# Extract only the main article content from Wikipedia\r\n# Wikipedia uses 'mw-parser-output' class for the main article content\r\ncontent = soup.find('div', class_='mw-parser-output')\r\n\r\ndef clean_wikipedia_content(content_node):\r\n \"\"\"Remove common non-article elements from a Wikipedia content node.\"\"\"\r\n # Strip jump links, navboxes, reference lists/superscripts, edit sections, TOC, sidebars, etc.\r\n selectors = [\r\n '.mw-jump-link',\r\n '.navbox',\r\n '.reflist',\r\n 'sup.reference',\r\n '.mw-editsection',\r\n '.hatnote',\r\n '.metadata',\r\n '.infobox',\r\n '#toc',\r\n '.toc',\r\n '.sidebar',\r\n ]\r\n for selector in selectors:\r\n for el in content_node.select(selector):\r\n el.decompose()\r\n\r\nif content:\r\n # Clean the content node to better approximate article text only.\r\n clean_wikipedia_content(content)\r\n text = content.get_text(separator=' ', strip=True)\r\n print(text[:1000])\r\nelse:\r\n print(\"Could not find main content. Using full page text.\")\r\n text = soup.get_text(separator=' ', strip=True)\r\n print(text[:1000])"
],
"outputs": [
{

Loading…
Cancel
Save