- 爬取网站
import urllib.request
# Fetch the page. The context manager closes the HTTP connection as soon as
# the body has been read, instead of leaving the response open.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()  # raw, undecoded bytes
print(html)  # printing a bytes object shows its b'...' repr
输出为:
b'<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en">\n<head>\n\n <meta charset="utf-8">\n <meta name="viewport" content="width=device-width, initial-scale=1.0"> \n\n <title>PHP: Hypertext Preprocessor</title>\n\n <link rel="shortcut icon" href="http://php.net/favicon.ico">\n <link rel="search" type="application/opensearchdescription+xml" href="http://php.net/phpnetimprovedsearch.src" title="Add PHP.net search">\n <link rel="alternate" type="application/atom+xml" href="http://php.net/releases/feed.php" title="PHP Release feed">\n <link rel="alternate" type="application/atom+xml" href="http://php.net/feed.atom" title="PHP: Hypertext Preprocessor">\n\n <link rel="canonical" href="http://php.net/index.php">\n <link rel="shorturl" href="http://php.net/index">\n <link rel="alternate" href="http://php.net/index" hreflang="x-default">\n\n\n\n<link rel="stylesheet" type="text/css" href="http://php.net/cached.php?t=1421837618&f=/fonts/Fira/fira.css" media="screen">\n<link rel="stylesheet" type="text/css" href="http://php.net/cached.php?t=1421837618&f=/fonts/Font-Awesome/css/fontello.css" media="screen">\n<link rel="stylesheet" type="text/css" href="http://php.net/cached.php?t=1478800802&f=/styles/theme-base.css" media="screen">\n<link rel="stylesheet" type="text/css" href="http://php.net/cached.php?t=1449787206&f=/styles/theme-medium.css" media="screen">\n<link rel="stylesheet" type="text/css" href="http://php.net/cached.php?t=1429259403&f=/styles/home.css" media="screen">\n\n <!--[if lte IE 7]>\n <link rel="stylesheet" type="text/css" href="http://php.net/styles/workarounds.ie7.css" media="screen">\n <![endif]-->\n\n <!--[if lte IE 8]>\n <script type="text/javascript">\n window.brokenIE = true;\n </script>\n <![endif]-->\n\n <!--[if lte IE 9]>\n <link rel="stylesheet" type="text/css" href="http://php.net/styles/workarounds.ie9.css" media="screen">\n <![endif]-->\n\n <!--[if IE]>\n <script type="text/javascript" 
src="http://php.net/js/ext/html5.js"></script>\n <![endif]-->\n\n <base href="http://php.net/index.php">\n\n</head>\n<body class="home ">\n\n<nav id="head-nav" class="navbar navbar-fixed-top">\n <div class="navbar-inner clearfix">\n <a href="/" class="brand"><img src="/images/logos/php-logo.svg" width="48" height="24" alt="php"></a>\n <div id="mainmenu-toggle-overlay"></div>\n <input type="checkbox" id="mainmenu-toggle">\n <ul class="nav">\n <li class=""><a href="/downloads">Downloads</a></li>\n <li class=""><a href="/docs.php">Documentation</a></li>\n <li class=""><a href="/get-involved" >Get Involved</a></li>\n <li class=""><a href="/support">Help</a></li>\n </ul>\n <form class="navbar-search" id="topsearch" action="/search.php">\n <input type="hidden" name="show" value="quickref">\n <input type="search" name="pattern" class="search-query" placeholder="Search" accesskey="s">\n </form>\n </div>\n <div id="flash-message"></div>\n</nav>\n<nav id="trick"><div><dl>\n<dt><a href=\'/manual/en/getting-started.php\'>Getting Started</a></dt>\n\t<dd><a href=\'/manual/en/introduction.php\'>Introduction</a></dd>\n\t<dd><a href=\'/manual/en/tutorial.php\'>A simple tutorial</a></dd>\n<dt><a href=\'/manual/en/langref.php\'>Language Reference</a></dt>\n\t<dd><a href=\'/manual/en/language.basic-syntax.php\'>Basic syntax</a></dd>\n\t<dd><a href=\'/manual/en/language.types.php\'>Types</a></dd>\n\t<dd><a href=\'/manual/en/language.variables.php\'>Variables</a></dd>\n\t<dd><a href=\'/manual/en/language.constants.php\'>Constants</a></dd>\n\t<dd><a href=\'/manual/en/language.expressions.php\'>Expressions</a></dd>\n\t<dd><a href=\'/manual/en/language.operators.php\'>Operators</a></dd>\n\t<dd><a href=\'/manual/en/language.control-structures.php\'>Control Structures</a></dd>\n\t<dd><a href=\'/manual/en/language.functions.php\'>Functions</a></dd>\n\t<dd><a href=\'/manual/en/language.oop5.php\'>Classes and Objects</a></dd>\n\t<dd><a 
href=\'/manual/en/language.namespaces.php\'>Namespaces</a></dd>\n\t<dd><a href=\'/manual/en/language.errors.php\'>Errors</a></dd>\n\t<dd><a href=\'/manual/en/language.exceptions.php\'>Exceptions</a></dd>\n\t<dd><a href=\'/manual/en/language.generators.php\'>Generators</a></dd>\n\t<dd><a href=\'/manual/en/language.references.php\'>References Explained</a></dd>\n\t<dd><a href=\'/manual/en/reserved.variables.php\'>Predefined Variables</a></dd>\n\t<dd><a href=\'/manual/en/reserved.exceptions.php\'>Predefined Exceptions</a></dd>\n\t<dd><a href=\'/manual/en/reserved.interfaces.php\'>Predefined Interfaces and Classes</a></dd>\n\t<dd><a href=\'/manual/en/context.php\'>Context options and parameters</a></dd>\n\t<dd><a href=\'/manual/en/wrappers.php\'>Supported Protocols and Wrappers</a></dd>\n</dl>\n<dl>\n<dt><a href=\'/manual/en/security.php\'>Security</a></dt>\n\t<dd><a href=\'/manual/en/security.intro.php\'>Introduction</a></dd>\n\t<dd><a href=\'/manual/en/security.general.php\'>General considerations</a></dd>\n\t<dd><a href=\'/manual/en/security.cgi-bin.php\'>Installed as CGI binary</a></dd>\n\t<dd><a href=\'/manual/en/security.apache.php\'>Installed as an Apache module</a></dd>\n\t<dd><a href=\'/manual/en/security.sessions.php\'>Session Security</a></dd>\n\t<dd><a href=\'/manual/en/security.filesystem.php\'>Filesystem Security</a></dd>\n\t<dd><a href=\'/manual/en/security.database.php\'>Database Security</a></dd>\n\t<dd><a href=\'/manual/en/security.errors.php\'>Error Reporting</a></dd>\n\t<dd><a href=\'/manual/en/security.globals.php\'>Using Register Globals</a></dd>\n\t<dd><a href=\'/manual/en/security.variables.php\'>User Submitted Data</a></dd>\n\t<dd><a href=\'/manual/en/security.magicquotes.php\'>Magic Quotes</a></dd>\n\t<dd><a href=\'/manual/en/security.hiding.php\'>Hiding PHP</a></dd>\n\t<dd><a href=\'/manual/en/security.current.php\'>Keeping Current</a></dd>\n<dt><a href=\'/manual/en/features.php\'>Features</a></dt>\n\t<dd><a 
href=\'/manual/en/features.http-auth.php\'>HTTP authentication with PHP</a></dd>\n\t<dd><a href=\'/manual/en/features.cookies.php\'>Cookies</a></dd>\n\t<dd><a href=\'/manual/en/features.sessions.php\'>Sessions</a></dd>\n\t<dd><a href=\'/manual/en/features.xforms.php\'>Dealing with XForms</a></dd>\n\t<dd><a href=\'/manual/en/features.file-upload.php\'>Handling file uploads</a></dd>\n\t<dd><a href=\'/manual/en/features.remote-files.php\'>Using remote files</a></dd>\n\t<dd><a href=\'/manual/en/features.connection-handling.php\'>Connection handling</a></dd>\n\t<dd><a href=\'/manual/en/features.persistent-connections.php\'>Persistent Database Connections</a></dd>\n\t<dd><a href=\'/manual/en/features.safe-mode.php\'>Safe Mode</a></dd>\n\t<dd><a href=\'/manual/en/features.commandline.php\'>Command line usage</a></dd>\n\t<dd><a href=\'/manual/en/features.gc.php\'>Garbage Collection</a></dd>\n\t<dd><a href=\'/manual/en/features.dtrace.php\'>DTrace Dynamic Tracing</a></dd>\n</dl>\n<dl>\n<dt><a href=\'/manual/en/funcref.php\'>Function Reference</a></dt>\n\t<dd><a href=\'/manual/en/refs.basic.php.php\'>Affecting PHP\'s Behaviour</a></dd>\n\t<dd><a href=\'/manual/en/refs.utilspec.audio.php\'>Audio Formats Manipulation</a></dd>\n\t<dd><a href=\'/manual/en/refs.remote.auth.php\'>Authentication Services</a></dd>\n\t<dd><a href=\'/manual/en/refs.utilspec.cmdline.php\'>Command Line Specific Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.compression.php\'>Compression and Archive Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.creditcard.php\'>Credit Card Processing</a></dd>\n\t<dd><a href=\'/manual/en/refs.crypto.php\'>Cryptography Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.database.php\'>Database Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.calendar.php\'>Date and Time Related Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.fileprocess.file.php\'>File System Related Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.international.php\'>Human 
Language and Character Encoding Support</a></dd>\n\t<dd><a href=\'/manual/en/refs.utilspec.image.php\'>Image Processing and Generation</a></dd>\n\t<dd><a href=\'/manual/en/refs.remote.mail.php\'>Mail Related Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.math.php\'>Mathematical Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.utilspec.nontext.php\'>Non-Text MIME Output</a></dd>\n\t<dd><a href=\'/manual/en/refs.fileprocess.process.php\'>Process Control Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.basic.other.php\'>Other Basic Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.remote.other.php\'>Other Services</a></dd>\n\t<dd><a href=\'/manual/en/refs.search.php\'>Search Engine Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.utilspec.server.php\'>Server Specific Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.basic.session.php\'>Session Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.basic.text.php\'>Text Processing</a></dd>\n\t<dd><a href=\'/manual/en/refs.basic.vartype.php\'>Variable and Type Related Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.webservice.php\'>Web Services</a></dd>\n\t<dd><a href=\'/manual/en/refs.utilspec.windows.php\'>Windows Only Extensions</a></dd>\n\t<dd><a href=\'/manual/en/refs.xml.php\'>XML Manipulation</a></dd>\n\t<dd><a href=\'/manual/en/refs.ui.php\'>GUI Extensions</a></dd>\n</dl>\n<dl>\n<dt>Keyboard Shortcuts</dt><dt>?</dt>\n<dd>This help</dd>\n<dt>j</dt>\n<dd>Next menu item</dd>\n<dt>k</dt>\n<dd>Previous menu item</dd>\n<dt>g p</dt>\n<dd>Previous man page</dd>\n<dt>g n</dt>\n<dd>Next man page</dd>\n<dt>G</dt>\n<dd>Scroll to bottom</dd>\n<dt>g g</dt>\n<dd>Scroll to top</dd>\n<dt>g h</dt>\n<dd>Goto homepage</dd>\n<dt>g s</dt>\n<dd>Goto search<br>(current page)</dd>\n<dt>/</dt>\n<dd>Focus search box</dd>\n</dl></div></nav>\n<div id="goto">\n <div class="search">\n <div class="text"></div>\n <div class="results"><ul></ul></div>\n </div>\n</div>\n\n\n\n<div id="intro" class="clearfix">\n <div 
class="container">\n <div class="row clearfix">\n <div class="blurb">\n <p>PHP is a popular general-purpose scripting language that is especially suited to web development.</p>\n <p>Fast, flexible and pragmatic, PHP powers everything from your blog to the most popular websites in the world.</p>\n </div>\n <div class="download">\n <h3>Download</h3><ul>\n\n <li><a class=\'download-link\' href=\'/downloads.php#v5.6.34\'>5.6.34</a><span class=\'dot\'>·</span><a class=\'notes\' href=\'/ChangeLog-5.php#5.6.34\'>Release Notes</a><span class=\'dot\'>·</span><a class=\'notes\' href=\'/migration56\'>Upgrading</a></li>\n\n <li><a class=\'download-link\' href=\'/downloads.php#v7.0.28\'>7.0.28</a><span class=\'dot\'>·</span><a class=\'notes\' href=\'/ChangeLog-7.php#7.0.28\'>Release Notes</a><span class=\'dot\'>·</span><a class=\'notes\' href=\'/migration70\'>Upgrading</a></li>\n\n <li><a class=\'download-link\' href=\'/downloads.php#v7.1.15\'>7.1.15</a><span class=\'dot\'>·</span><a class=\'notes\' href=\'/ChangeLog-7.php#7.1.15\'>Release Notes</a><span class=\'dot\'>·</span><a class=\'notes\' href=\'/migration71\'>Upgrading</a></li>\n\n <li><a class=\'download-link\' href=\'/downloads.php#v7.2.3\'>7.2.3</a><span class=\'dot\'>·</span><a class=\'notes\' href=\'/ChangeLog-7.php#7.2.3\'>Release Notes</a><span class=\'dot\'>·</span><a class=\'notes\' href=\'/migration72\'>Upgrading</a></li>\n</ul>\n </div>\n </div> </div>\n</div>\n\n\n<div id="layout" class="clearfix">\n <section id="layout-content">\n<div class=\'home-content\'><article class="newsentry">\n <header class="title">\n <time datetime="2018-03-02T05:54:19+00:00">02 Mar 2018</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2018.php#id2018-03-02-1" id="id2018-03-02-1">PHP 7.1.15 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>The PHP development team announces the immediate availability of PHP\n 7.1.15. 
This is a security fix release, containing one security fix and many bug fixes.\n \n All PHP 7.1 users are encouraged to upgrade to this version.\n </p>\n \n <p>For source downloads of PHP 7.1.15 please visit our <a href="http://www.php.net/downloads.php">downloads page</a>,\n Windows source and binaries can be found on <a href="http://windows.php.net/download/">windows.php.net/download/</a>.\n The list of changes is recorded in the <a href="http://www.php.net/ChangeLog-7.php#7.1.15">ChangeLog</a>.\n </p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2018-03-01T15:48:47-08:00">01 Mar 2018</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2018.php#id2018-03-01-3" id="id2018-03-01-3">PHP 5.6.34 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>The PHP development team announces the immediate availability of PHP\n 5.6.34. This is a security release. One security bug was fixed in\n this release.\n\n All PHP 5.6 users are encouraged to upgrade to this version.</p>\n\n <p>For source downloads of PHP 5.6.34 please visit our <a href="http://www.php.net/downloads.php">downloads page</a>,\n Windows source and binaries can be found on <a href="http://windows.php.net/download/">windows.php.net/download/</a>.\n The list of changes is recorded in the <a href="http://www.php.net/ChangeLog-5.php#5.6.34">ChangeLog</a>.\n </p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2018-03-01T19:43:30+00:00">01 Mar 2018</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2018.php#id2018-03-01-2" id="id2018-03-01-2">PHP 7.2.3 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>The PHP development team announces the immediate availability of PHP\n 7.2.3. 
This is a security release with also contains several minor bug fixes.</p>\n \n <p>All PHP 7.2 users are encouraged to upgrade to this version.</p>\n \n <p>For source downloads of PHP 7.2.3 please visit our <a href="http://www.php.net/downloads.php">downloads page</a>,\n Windows source and binaries can be found on <a href="http://windows.php.net/download/">windows.php.net/download/</a>.\n The list of changes is recorded in the <a href="http://www.php.net/ChangeLog-7.php#7.2.3">ChangeLog</a>.\n </p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2018-03-01T11:45:00+01:00">01 Mar 2018</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2018.php#id2018-03-01-1" id="id2018-03-01-1">PHP 7.0.28 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>The PHP development team announces the immediate availability of PHP\n 7.0.28. This is a security release. One security bug was fixed in\n this release.\n \n All PHP 7.0 users are encouraged to upgrade to this version.</p>\n\n <p>For source downloads of PHP 7.0.28 please visit our <a href="http://www.php.net/downloads.php">downloads page</a>,\n Windows source and binaries can be found on <a href="http://windows.php.net/download/">windows.php.net/download/</a>.\n The list of changes is recorded in the <a href="http://www.php.net/ChangeLog-7.php#7.0.28">ChangeLog</a>.\n </p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2018-02-01T14:54:13+00:00">01 Feb 2018</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2018.php#id2018-02-01-2" id="id2018-02-01-2">PHP 7.1.14 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>The PHP development team announces the immediate availability of PHP\n 7.1.14. This is a bugfix release. 
Several bugs were fixed\n in this release.</p>\n \n <p>All PHP 7.1 users are encouraged to upgrade to this version.</p>\n \n <p>For source downloads of PHP 7.1.14 please visit our <a href="http://www.php.net/downloads.php">downloads page</a>, Windows source and binaries can be found on <a href="http://windows.php.net/download/">windows.php.net/download/</a>.\n The list of changes is recorded in the <a href="http://www.php.net/ChangeLog-7.php#7.1.14">ChangeLog</a>.\n </p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2018-02-01T09:12:34+00:00">01 Feb 2018</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2018.php#id2018-02-01-1" id="id2018-02-01-1">PHP 7.2.2 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>The PHP development team announces the immediate availability of PHP\n 7.2.2. This is a bugfix release, with several bug fixes included.</p>\n \n <p>All PHP 7.2 users are encouraged to upgrade to this version.</p>\n \n <p>For source downloads of PHP 7.2.2 please visit our <a href="http://www.php.net/downloads.php">downloads page</a>,\n Windows source and binaries can be found on <a href="http://windows.php.net/download/">windows.php.net/download/</a>.\n The list of changes is recorded in the <a href="http://www.php.net/ChangeLog-7.php#7.2.2">ChangeLog</a>.\n </p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2018-01-04T12:21:10-08:00">04 Jan 2018</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2018.php#id2018-01-04-4" id="id2018-01-04-4">PHP 5.6.33 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>The PHP development team announces the immediate availability of PHP\n 5.6.33. This is a security release. 
Several security bugs were fixed in\n this release.\n\n All PHP 5.6 users are encouraged to upgrade to this version.</p>\n\n <p>For source downloads of PHP 5.6.33 please visit our <a href="http://www.php.net/downloads.php">downloads page</a>,\n Windows source and binaries can be found on <a href="http://windows.php.net/download/">windows.php.net/download/</a>.\n The list of changes is recorded in the <a href="http://www.php.net/ChangeLog-5.php#5.6.33">ChangeLog</a>.\n </p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2017-10-12T11:46:49+02:00">12 Oct 2017</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2017.php#id2017-10-12-1" id="id2017-10-12-1">PHP 7.2.0 Release Candidate 4 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>\n The PHP development team announces the immediate availability of PHP 7.2.0 RC4.\n This release is the fourth Release Candidate for 7.2.0.\n All users of PHP are encouraged to test this version carefully, and report any bugs\n and incompatibilities in the <a href="https://bugs.php.net/">bug tracking system</a>.\n </p>\n \n <p><strong>THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!</strong></p>\n \n <p>\n For more information on the new features and other changes, you can read the\n <a href="https://github.com/php/php-src/blob/php-7.2.0RC4/NEWS">NEWS</a> file,\n or the <a href="https://github.com/php/php-src/blob/php-7.2.0RC4/UPGRADING">UPGRADING</a>\n file for a complete list of upgrading notes. 
These files can also be found in the release archive.\n </p>\n \n <p>\n For source downloads of PHP 7.2.0 Release Candidate 4 please visit the\n <a href="https://downloads.php.net/~remi/">download</a> page,\n Windows sources and binaries can be found at\n <a href="http://windows.php.net/qa/">windows.php.net/qa/</a>.\n </p>\n \n <p>\n The next Release Candidate will be announced on the 26th of October.\n You can also read the full list of planned releases on\n <a href="https://wiki.php.net/todo/php72">our wiki</a>.\n </p>\n \n <p>Thank you for helping us make PHP better.</p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2017-09-28T12:58:56+02:00">28 Sep 2017</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2017.php#id2017-09-28-2" id="id2017-09-28-2">PHP 7.2.0 Release Candidate 3 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>\n The PHP development team announces the immediate availability of PHP 7.2.0 RC3.\n This release is the third Release Candidate for 7.2.0.\n All users of PHP are encouraged to test this version carefully, and report any bugs\n and incompatibilities in the <a href="https://bugs.php.net/">bug tracking system</a>.\n </p>\n \n <p><strong>THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!</strong></p>\n \n <p>\n For more information on the new features and other changes, you can read the\n <a href="https://github.com/php/php-src/blob/php-7.2.0RC3/NEWS">NEWS</a> file,\n or the <a href="https://github.com/php/php-src/blob/php-7.2.0RC3/UPGRADING">UPGRADING</a>\n file for a complete list of upgrading notes. 
These files can also be found in the release archive.\n </p>\n \n <p>\n For source downloads of PHP 7.2.0 Release Candidate 3 please visit the\n <a href="https://downloads.php.net/~remi/">download</a> page,\n Windows sources and binaries can be found at\n <a href="http://windows.php.net/qa/">windows.php.net/qa/</a>.\n </p>\n \n <p>\n The next Release Candidate will be announced on the 12th of October.\n You can also read the full list of planned releases on\n <a href="https://wiki.php.net/todo/php72">our wiki</a>.\n </p>\n \n <p>Thank you for helping us make PHP better.</p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2017-08-31T10:53:58+02:00">31 Aug 2017</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2017.php#id2017-08-31-1" id="id2017-08-31-1">PHP 7.2.0 Release Candidate 1 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>\n The PHP development team announces the immediate availability of PHP 7.2.0 Release\n Candidate 1. This release is the first Release Candidate for 7.2.0.\n All users of PHP are encouraged to test this version carefully, and report any bugs\n and incompatibilities in the <a href="https://bugs.php.net/">bug tracking system</a>.\n </p>\n\n <p><strong>THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!</strong></p>\n\n <p>\n For more information on the new features and other changes, you can read the\n <a href="https://github.com/php/php-src/blob/php-7.2.0RC1/NEWS">NEWS</a> file,\n or the <a href="https://github.com/php/php-src/blob/php-7.2.0RC1/UPGRADING">UPGRADING</a>\n file for a complete list of upgrading notes. 
These files can also be found in the release archive.\n </p>\n\n <p>\n For source downloads of PHP 7.2.0 Release Candidate 1 please visit the\n <a href="https://downloads.php.net/~remi/">download</a> page,\n Windows sources and binaries can be found at\n <a href="http://windows.php.net/qa/">windows.php.net/qa/</a>.\n </p>\n\n <p>\n The second Release Candidate will be released on the 14th of September.\n You can also read the full list of planned releases on\n <a href="https://wiki.php.net/todo/php72">our wiki</a>.\n </p>\n\n <p>Thank you for helping us make PHP better.</p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2017-08-17T10:17:44+02:00">17 Aug 2017</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2017.php#id2017-08-17-1" id="id2017-08-17-1">PHP 7.2.0 Beta 3 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>\n The PHP development team announces the immediate availability of PHP 7.2.0 Beta 3.\n This release is the third and final beta for 7.2.0. All users of PHP are encouraged\n to test this version carefully, and report any bugs and incompatibilities in the\n <a href="https://bugs.php.net/">bug tracking system</a>.\n </p>\n\n <p><strong>THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!</strong></p>\n\n <p>\n For more information on the new features and other changes, you can read the\n <a href="https://github.com/php/php-src/blob/php-7.2.0beta3/NEWS">NEWS</a> file,\n or the <a href="https://github.com/php/php-src/blob/php-7.2.0beta3/UPGRADING">UPGRADING</a>\n file for a complete list of upgrading notes. 
These files can also be found in the release archive.\n </p>\n\n <p>\n For source downloads of PHP 7.2.0 Beta 3 please visit the\n <a href="https://downloads.php.net/~remi/">download</a> page,\n Windows sources and binaries can be found at\n <a href="http://windows.php.net/qa/">windows.php.net/qa/</a>.\n </p>\n\n <p>\n The first Release Candidate will be released on the 31th of August.\n You can also read the full list of planned releases on\n <a href="https://wiki.php.net/todo/php72">our wiki</a>.\n </p>\n\n <p>Thank you for helping us make PHP better.</p>\n </div>\n \n </div>\n</article><article class="newsentry">\n <header class="title">\n <time datetime="2017-07-06T12:25:08+02:00">06 Jul 2017</time>\n <h2 class="newstitle">\n <a href="http://php.net/archive/2017.php#id2017-07-06-2" id="id2017-07-06-2">PHP 7.2.0 Alpha 3 Released</a>\n </h2>\n </header>\n <div class="newscontent">\n <div>\n <p>The PHP development team announces the immediate availability of PHP 7.2.0 Alpha 3.\n This release contains fixes and improvements relative to Alpha 2.\n All users of PHP are encouraged to test this version carefully,\n and report any bugs and incompatibilities in the\n <a href="https://bugs.php.net/">bug tracking system</a>.</p>\n\n <p><strong>THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!</strong></p>\n\n <p>For information on new features and other changes, you can read the\n <a href="https://github.com/php/php-src/blob/php-7.2.0alpha3/NEWS">NEWS</a> file,\n or the <a href="https://github.com/php/php-src/blob/php-7.2.0alpha3/UPGRADING">UPGRADING</a> file\n for a complete list of upgrading notes. These files can also be found in the release archive.</p>\n\n <p>For source downloads of PHP 7.2.0 Alpha 3 please visit the <a href="https://downloads.php.net/~remi/">download</a> page,\n Windows sources and binaries can be found on <a href="http://windows.php.net/qa/">windows.php.net/qa/</a>.</p>\n\n <p>The first beta will be released on the 20th of July. 
You can also read the full list of planned releases on our\n <a href="https://wiki.php.net/todo/php72#timetable">wiki</a>.</p>\n\n <p>Thank you for helping us make PHP better.</p>\n </div>\n \n </div>\n</article><p class="archive"><a href="/archive/">Older News Entries</a></p></div> </section><!-- layout-content -->\n \n<aside class="tips">\n <div class="inner">\n<div class="panel"> <a href="/conferences" class="headline" title="Conferences calling for papers">Conferences calling for papers</a><div class="body"><ul><li><a href=\'http://php.net/conferences/index.php#id2018-02-16-1\' title=\'Mid-Atlantic Developer Conference\'>Mid-Atlantic Developer Conference</a></li></ul></div></div><div class="panel"> <a href="/conferences" class="headline" title="Upcoming conferences">Upcoming conferences</a><div class="body"><ul><li><a href=\'http://php.net/conferences/index.php#id2018-02-14-1\' title=\'ConFoo: THE web development conference you don\xe2\x80\x99t want to miss!\'>ConFoo: THE web development conference you don\xe2\x80\x99t want to miss!</a></li><li><a href=\'http://php.net/conferences/index.php#id2018-02-07-2\' title=\'php[tek] 2018\'>php[tek] 2018</a></li><li><a href=\'http://php.net/conferences/index.php#id2018-02-01-3\' title=\'PHP Experience 2018\'>PHP Experience 2018</a></li><li><a href=\'http://php.net/conferences/index.php#id2018-01-09-1\' title=\'Dutch PHP Conference 2018\'>Dutch PHP Conference 2018</a></li></ul></div></div>\n <p class=\'panel\'><a href=\'/cal.php\'>User Group Events</a></p>\n <p class=\'panel\'><a href=\'/thanks.php\'>Special Thanks</a></p>\n <p class=\'panel social-media\'>\n <span class=\'headline\'>Social media</span>\n <div class=\'body\'>\n <ul>\n <li>\n <a href="https://twitter.com/official_php">\n <i class="icon-twitter"></i>\n @official_php\n </a>\n </li>\n </ul>\n </div>\n </p>\n</div>\n</aside>\n\n </div><!-- layout -->\n \n <footer>\n <div class="container footer-content">\n <div class="row-fluid">\n <ul class="footmenu">\n 
<li><a href="/copyright.php">Copyright © 2001-2018 The PHP Group</a></li>\n <li><a href="/my.php">My PHP.net</a></li>\n <li><a href="/contact.php">Contact</a></li>\n <li><a href="/sites.php">Other PHP.net sites</a></li>\n <li><a href="/mirrors.php">Mirror sites</a></li>\n <li><a href="/privacy.php">Privacy policy</a></li>\n </ul>\n </div>\n </div>\n </footer>\n\n <div class=\'elephpants\'><div class=images></div></div>\n <!-- External and third party libraries. -->\n <script type="text/javascript" src="//ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js"></script>\n<script type="text/javascript" src="http://php.net/cached.php?t=1421837618&f=/js/ext/modernizr.js"></script>\n<script type="text/javascript" src="http://php.net/cached.php?t=1421837618&f=/js/ext/hogan-2.0.0.min.js"></script>\n<script type="text/javascript" src="http://php.net/cached.php?t=1421837618&f=/js/ext/typeahead.min.js"></script>\n<script type="text/javascript" src="http://php.net/cached.php?t=1421837618&f=/js/ext/mousetrap.min.js"></script>\n<script type="text/javascript" src="http://php.net/cached.php?t=1421837618&f=/js/search.js"></script>\n<script type="text/javascript" src="http://php.net/cached.php?t=1516300802&f=/js/common.js"></script>\n\n<a id="toTop" href="javascript:;"><span id="toTopHover"></span><img width="40" height="40" alt="To Top" src="/images/to-top@2x.png"></a>\n\n</body>\n</html>\n\n'
- 转换为干净文本
import urllib.request
from bs4 import BeautifulSoup
# Download the page. The context manager closes the HTTP response as soon as
# the body has been read instead of leaving the connection open.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# Parse the raw bytes with the html5lib parser (the html5lib package must be
# installed for this parser name to be available).
soup = BeautifulSoup(html, "html5lib")

# Drop all markup and strip the whitespace around every text fragment.
text = soup.get_text(strip=True)

# `text` now holds the clean, tag-free text of the page.
print(text)
输出为:
PHP: Hypertext PreprocessorDownloadsDocumentationGet InvolvedHelpGetting StartedIntroductionA simple tutorialLanguage ReferenceBasic syntaxTypesVariablesConstantsExpressionsOperatorsControl StructuresFunctionsClasses and ObjectsNamespacesErrorsExceptionsGeneratorsReferences ExplainedPredefined VariablesPredefined ExceptionsPredefined Interfaces and ClassesContext options and parametersSupported Protocols and WrappersSecurityIntroductionGeneral considerationsInstalled as CGI binaryInstalled as an Apache moduleSession SecurityFilesystem SecurityDatabase SecurityError ReportingUsing Register GlobalsUser Submitted DataMagic QuotesHiding PHPKeeping CurrentFeaturesHTTP authentication with PHPCookiesSessionsDealing with XFormsHandling file uploadsUsing remote filesConnection handlingPersistent Database ConnectionsSafe ModeCommand line usageGarbage CollectionDTrace Dynamic TracingFunction ReferenceAffecting PHP's BehaviourAudio Formats ManipulationAuthentication ServicesCommand Line Specific ExtensionsCompression and Archive ExtensionsCredit Card ProcessingCryptography ExtensionsDatabase ExtensionsDate and Time Related ExtensionsFile System Related ExtensionsHuman Language and Character Encoding SupportImage Processing and GenerationMail Related ExtensionsMathematical ExtensionsNon-Text MIME OutputProcess Control ExtensionsOther Basic ExtensionsOther ServicesSearch Engine ExtensionsServer Specific ExtensionsSession ExtensionsText ProcessingVariable and Type Related ExtensionsWeb ServicesWindows Only ExtensionsXML ManipulationGUI ExtensionsKeyboard Shortcuts?This helpjNext menu itemkPrevious menu itemg pPrevious man pageg nNext man pageGScroll to bottomg gScroll to topg hGoto homepageg sGoto search(current page)/Focus search boxPHP is a popular general-purpose scripting language that is especially suited to web development.Fast, flexible and pragmatic, PHP powers everything from your blog to the most popular websites in the world.Download5.6.34·Release 
Notes·Upgrading7.0.28·Release Notes·Upgrading7.1.15·Release Notes·Upgrading7.2.3·Release Notes·Upgrading02 Mar 2018PHP 7.1.15 ReleasedThe PHP development team announces the immediate availability of PHP
7.1.15. This is a security fix release, containing one security fix and many bug fixes.
All PHP 7.1 users are encouraged to upgrade to this version.For source downloads of PHP 7.1.15 please visit ourdownloads page,
Windows source and binaries can be found onwindows.php.net/download/.
The list of changes is recorded in theChangeLog.01 Mar 2018PHP 5.6.34 ReleasedThe PHP development team announces the immediate availability of PHP
5.6.34. This is a security release. One security bug was fixed in
this release.
All PHP 5.6 users are encouraged to upgrade to this version.For source downloads of PHP 5.6.34 please visit ourdownloads page,
Windows source and binaries can be found onwindows.php.net/download/.
The list of changes is recorded in theChangeLog.01 Mar 2018PHP 7.2.3 ReleasedThe PHP development team announces the immediate availability of PHP
7.2.3. This is a security release with also contains several minor bug fixes.All PHP 7.2 users are encouraged to upgrade to this version.For source downloads of PHP 7.2.3 please visit ourdownloads page,
Windows source and binaries can be found onwindows.php.net/download/.
The list of changes is recorded in theChangeLog.01 Mar 2018PHP 7.0.28 ReleasedThe PHP development team announces the immediate availability of PHP
7.0.28. This is a security release. One security bug was fixed in
this release.
All PHP 7.0 users are encouraged to upgrade to this version.For source downloads of PHP 7.0.28 please visit ourdownloads page,
Windows source and binaries can be found onwindows.php.net/download/.
The list of changes is recorded in theChangeLog.01 Feb 2018PHP 7.1.14 ReleasedThe PHP development team announces the immediate availability of PHP
7.1.14. This is a bugfix release. Several bugs were fixed
in this release.All PHP 7.1 users are encouraged to upgrade to this version.For source downloads of PHP 7.1.14 please visit ourdownloads page, Windows source and binaries can be found onwindows.php.net/download/.
The list of changes is recorded in theChangeLog.01 Feb 2018PHP 7.2.2 ReleasedThe PHP development team announces the immediate availability of PHP
7.2.2. This is a bugfix release, with several bug fixes included.All PHP 7.2 users are encouraged to upgrade to this version.For source downloads of PHP 7.2.2 please visit ourdownloads page,
Windows source and binaries can be found onwindows.php.net/download/.
The list of changes is recorded in theChangeLog.04 Jan 2018PHP 5.6.33 ReleasedThe PHP development team announces the immediate availability of PHP
5.6.33. This is a security release. Several security bugs were fixed in
this release.
All PHP 5.6 users are encouraged to upgrade to this version.For source downloads of PHP 5.6.33 please visit ourdownloads page,
Windows source and binaries can be found onwindows.php.net/download/.
The list of changes is recorded in theChangeLog.12 Oct 2017PHP 7.2.0 Release Candidate 4 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 RC4.
This release is the fourth Release Candidate for 7.2.0.
All users of PHP are encouraged to test this version carefully, and report any bugs
and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,
or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Release Candidate 4 please visit thedownloadpage,
Windows sources and binaries can be found atwindows.php.net/qa/.The next Release Candidate will be announced on the 26th of October.
You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.28 Sep 2017PHP 7.2.0 Release Candidate 3 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 RC3.
This release is the third Release Candidate for 7.2.0.
All users of PHP are encouraged to test this version carefully, and report any bugs
and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,
or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Release Candidate 3 please visit thedownloadpage,
Windows sources and binaries can be found atwindows.php.net/qa/.The next Release Candidate will be announced on the 12th of October.
You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.31 Aug 2017PHP 7.2.0 Release Candidate 1 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 Release
Candidate 1. This release is the first Release Candidate for 7.2.0.
All users of PHP are encouraged to test this version carefully, and report any bugs
and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,
or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Release Candidate 1 please visit thedownloadpage,
Windows sources and binaries can be found atwindows.php.net/qa/.The second Release Candidate will be released on the 14th of September.
You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.17 Aug 2017PHP 7.2.0 Beta 3 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 Beta 3.
This release is the third and final beta for 7.2.0. All users of PHP are encouraged
to test this version carefully, and report any bugs and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For more information on the new features and other changes, you can read theNEWSfile,
or theUPGRADINGfile for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Beta 3 please visit thedownloadpage,
Windows sources and binaries can be found atwindows.php.net/qa/.The first Release Candidate will be released on the 31th of August.
You can also read the full list of planned releases onour wiki.Thank you for helping us make PHP better.06 Jul 2017PHP 7.2.0 Alpha 3 ReleasedThe PHP development team announces the immediate availability of PHP 7.2.0 Alpha 3.
This release contains fixes and improvements relative to Alpha 2.
All users of PHP are encouraged to test this version carefully,
and report any bugs and incompatibilities in thebug tracking system.THIS IS A DEVELOPMENT PREVIEW - DO NOT USE IT IN PRODUCTION!For information on new features and other changes, you can read theNEWSfile,
or theUPGRADINGfile
for a complete list of upgrading notes. These files can also be found in the release archive.For source downloads of PHP 7.2.0 Alpha 3 please visit thedownloadpage,
Windows sources and binaries can be found onwindows.php.net/qa/.The first beta will be released on the 20th of July. You can also read the full list of planned releases on ourwiki.Thank you for helping us make PHP better.Older News EntriesConferences calling for papersMid-Atlantic Developer ConferenceUpcoming conferencesConFoo: THE web development conference you don’t want to miss!php[tek] 2018PHP Experience 2018Dutch PHP Conference 2018User Group EventsSpecial ThanksSocial media@official_phpCopyright © 2001-2018 The PHP GroupMy PHP.netContactOther PHP.net sitesMirror sitesPrivacy policy
- 转换为tokens
import urllib.request
from bs4 import BeautifulSoup
# Download the page. The context manager closes the HTTP response as soon as
# the body has been read instead of leaving the connection open.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# Parse the raw bytes with the html5lib parser (the html5lib package must be
# installed for this parser name to be available).
soup = BeautifulSoup(html, "html5lib")

# Drop all markup and strip the whitespace around every text fragment;
# `text` then holds the clean, tag-free text of the page.
text = soup.get_text(strip=True)

# Turn the text into tokens by splitting on runs of whitespace.
tokens = text.split()
print(tokens)
输出为:
['PHP:', 'Hypertext', 'PreprocessorDownloadsDocumentationGet', 'InvolvedHelpGetting', 'StartedIntroductionA', 'simple', 'tutorialLanguage', 'ReferenceBasic', 'syntaxTypesVariablesConstantsExpressionsOperatorsControl', 'StructuresFunctionsClasses', 'and', 'ObjectsNamespacesErrorsExceptionsGeneratorsReferences', 'ExplainedPredefined', 'VariablesPredefined', 'ExceptionsPredefined', 'Interfaces', 'and', 'ClassesContext', 'options', 'and', 'parametersSupported', 'Protocols', 'and', 'WrappersSecurityIntroductionGeneral', 'considerationsInstalled', 'as', 'CGI', 'binaryInstalled', 'as', 'an', 'Apache', 'moduleSession', 'SecurityFilesystem', 'SecurityDatabase', 'SecurityError', 'ReportingUsing', 'Register', 'GlobalsUser', 'Submitted', 'DataMagic', 'QuotesHiding', 'PHPKeeping', 'CurrentFeaturesHTTP', 'authentication', 'with', 'PHPCookiesSessionsDealing', 'with', 'XFormsHandling', 'file', 'uploadsUsing', 'remote', 'filesConnection', 'handlingPersistent', 'Database', 'ConnectionsSafe', 'ModeCommand', 'line', 'usageGarbage', 'CollectionDTrace', 'Dynamic', 'TracingFunction', 'ReferenceAffecting', "PHP's", 'BehaviourAudio', 'Formats', 'ManipulationAuthentication', 'ServicesCommand', 'Line', 'Specific', 'ExtensionsCompression', 'and', 'Archive', 'ExtensionsCredit', 'Card', 'ProcessingCryptography', 'ExtensionsDatabase', 'ExtensionsDate', 'and', 'Time', 'Related', 'ExtensionsFile', 'System', 'Related', 'ExtensionsHuman', 'Language', 'and', 'Character', 'Encoding', 'SupportImage', 'Processing', 'and', 'GenerationMail', 'Related', 'ExtensionsMathematical', 'ExtensionsNon-Text', 'MIME', 'OutputProcess', 'Control', 'ExtensionsOther', 'Basic', 'ExtensionsOther', 'ServicesSearch', 'Engine', 'ExtensionsServer', 'Specific', 'ExtensionsSession', 'ExtensionsText', 'ProcessingVariable', 'and', 'Type', 'Related', 'ExtensionsWeb', 'ServicesWindows', 'Only', 'ExtensionsXML', 'ManipulationGUI', 'ExtensionsKeyboard', 'Shortcuts?This', 'helpjNext', 'menu', 'itemkPrevious', 'menu', 'itemg', 
'pPrevious', 'man', 'pageg', 'nNext', 'man', 'pageGScroll', 'to', 'bottomg', 'gScroll', 'to', 'topg', 'hGoto', 'homepageg', 'sGoto', 'search(current', 'page)/Focus', 'search', 'boxPHP', 'is', 'a', 'popular', 'general-purpose', 'scripting', 'language', 'that', 'is', 'especially', 'suited', 'to', 'web', 'development.Fast,', 'flexible', 'and', 'pragmatic,', 'PHP', 'powers', 'everything', 'from', 'your', 'blog', 'to', 'the', 'most', 'popular', 'websites', 'in', 'the', 'world.Download5.6.34·Release', 'Notes·Upgrading7.0.28·Release', 'Notes·Upgrading7.1.15·Release', 'Notes·Upgrading7.2.3·Release', 'Notes·Upgrading02', 'Mar', '2018PHP', '7.1.15', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.1.15.', 'This', 'is', 'a', 'security', 'fix', 'release,', 'containing', 'one', 'security', 'fix', 'and', 'many', 'bug', 'fixes.', 'All', 'PHP', '7.1', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.1.15', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.01', 'Mar', '2018PHP', '5.6.34', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '5.6.34.', 'This', 'is', 'a', 'security', 'release.', 'One', 'security', 'bug', 'was', 'fixed', 'in', 'this', 'release.', 'All', 'PHP', '5.6', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '5.6.34', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.01', 'Mar', '2018PHP', '7.2.3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', 
'7.2.3.', 'This', 'is', 'a', 'security', 'release', 'with', 'also', 'contains', 'several', 'minor', 'bug', 'fixes.All', 'PHP', '7.2', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.2.3', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.01', 'Mar', '2018PHP', '7.0.28', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.0.28.', 'This', 'is', 'a', 'security', 'release.', 'One', 'security', 'bug', 'was', 'fixed', 'in', 'this', 'release.', 'All', 'PHP', '7.0', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.0.28', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.01', 'Feb', '2018PHP', '7.1.14', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.1.14.', 'This', 'is', 'a', 'bugfix', 'release.', 'Several', 'bugs', 'were', 'fixed', 'in', 'this', 'release.All', 'PHP', '7.1', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '7.1.14', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.01', 'Feb', '2018PHP', '7.2.2', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.2.', 'This', 'is', 'a', 'bugfix', 'release,', 'with', 'several', 'bug', 'fixes', 'included.All', 'PHP', '7.2', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 
'version.For', 'source', 'downloads', 'of', 'PHP', '7.2.2', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.04', 'Jan', '2018PHP', '5.6.33', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '5.6.33.', 'This', 'is', 'a', 'security', 'release.', 'Several', 'security', 'bugs', 'were', 'fixed', 'in', 'this', 'release.', 'All', 'PHP', '5.6', 'users', 'are', 'encouraged', 'to', 'upgrade', 'to', 'this', 'version.For', 'source', 'downloads', 'of', 'PHP', '5.6.33', 'please', 'visit', 'ourdownloads', 'page,', 'Windows', 'source', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/download/.', 'The', 'list', 'of', 'changes', 'is', 'recorded', 'in', 'theChangeLog.12', 'Oct', '2017PHP', '7.2.0', 'Release', 'Candidate', '4', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'RC4.', 'This', 'release', 'is', 'the', 'fourth', 'Release', 'Candidate', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '4', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'will', 'be', 
'announced', 'on', 'the', '26th', 'of', 'October.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.28', 'Sep', '2017PHP', '7.2.0', 'Release', 'Candidate', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'RC3.', 'This', 'release', 'is', 'the', 'third', 'Release', 'Candidate', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'next', 'Release', 'Candidate', 'will', 'be', 'announced', 'on', 'the', '12th', 'of', 'October.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.31', 'Aug', '2017PHP', '7.2.0', 'Release', 'Candidate', '1', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '1.', 'This', 'release', 'is', 'the', 'first', 'Release', 'Candidate', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 
'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Release', 'Candidate', '1', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'second', 'Release', 'Candidate', 'will', 'be', 'released', 'on', 'the', '14th', 'of', 'September.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.17', 'Aug', '2017PHP', '7.2.0', 'Beta', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'Beta', '3.', 'This', 'release', 'is', 'the', 'third', 'and', 'final', 'beta', 'for', '7.2.0.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'more', 'information', 'on', 'the', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Beta', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'atwindows.php.net/qa/.The', 'first', 'Release', 'Candidate', 'will', 'be', 'released', 'on', 'the', '31th', 'of', 
'August.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'onour', 'wiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.06', 'Jul', '2017PHP', '7.2.0', 'Alpha', '3', 'ReleasedThe', 'PHP', 'development', 'team', 'announces', 'the', 'immediate', 'availability', 'of', 'PHP', '7.2.0', 'Alpha', '3.', 'This', 'release', 'contains', 'fixes', 'and', 'improvements', 'relative', 'to', 'Alpha', '2.', 'All', 'users', 'of', 'PHP', 'are', 'encouraged', 'to', 'test', 'this', 'version', 'carefully,', 'and', 'report', 'any', 'bugs', 'and', 'incompatibilities', 'in', 'thebug', 'tracking', 'system.THIS', 'IS', 'A', 'DEVELOPMENT', 'PREVIEW', '-', 'DO', 'NOT', 'USE', 'IT', 'IN', 'PRODUCTION!For', 'information', 'on', 'new', 'features', 'and', 'other', 'changes,', 'you', 'can', 'read', 'theNEWSfile,', 'or', 'theUPGRADINGfile', 'for', 'a', 'complete', 'list', 'of', 'upgrading', 'notes.', 'These', 'files', 'can', 'also', 'be', 'found', 'in', 'the', 'release', 'archive.For', 'source', 'downloads', 'of', 'PHP', '7.2.0', 'Alpha', '3', 'please', 'visit', 'thedownloadpage,', 'Windows', 'sources', 'and', 'binaries', 'can', 'be', 'found', 'onwindows.php.net/qa/.The', 'first', 'beta', 'will', 'be', 'released', 'on', 'the', '20th', 'of', 'July.', 'You', 'can', 'also', 'read', 'the', 'full', 'list', 'of', 'planned', 'releases', 'on', 'ourwiki.Thank', 'you', 'for', 'helping', 'us', 'make', 'PHP', 'better.Older', 'News', 'EntriesConferences', 'calling', 'for', 'papersMid-Atlantic', 'Developer', 'ConferenceUpcoming', 'conferencesConFoo:', 'THE', 'web', 'development', 'conference', 'you', 'don’t', 'want', 'to', 'miss!php[tek]', '2018PHP', 'Experience', '2018Dutch', 'PHP', 'Conference', '2018User', 'Group', 'EventsSpecial', 'ThanksSocial', 'media@official_phpCopyright', '©', '2001-2018', 'The', 'PHP', 'GroupMy', 'PHP.netContactOther', 'PHP.net', 'sitesMirror', 'sitesPrivacy', 'policy']
- 完整版 python爬取文字加分词预处理(英文)
import nltk
# nltk.download()
import urllib.request
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
# Download the page. The context manager closes the HTTP response as soon as
# the body has been read instead of leaving the connection open.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# Parse the raw bytes with the html5lib parser (the html5lib package must be
# installed for this parser name to be available).
soup = BeautifulSoup(html, "html5lib")

# Extract the tag-free text. A single-space separator is inserted between the
# text of neighbouring tags; without it, words from adjacent elements are glued
# together (e.g. 'PreprocessorDownloadsDocumentationGet') and can never be
# split into proper tokens afterwards.
text = soup.get_text(separator=" ", strip=True)

# Turn the text into tokens by splitting on runs of whitespace.
tokens = text.split()

# To inspect the distribution *before* stop-word removal, build
# nltk.FreqDist(tokens) and print/plot it exactly as done for `freq` below.

# Remove stop words. The list is converted to a set once so every membership
# test is O(1), and tokens are lower-cased for the comparison so capitalised
# stop words such as 'The' or 'This' are filtered as well.
# NOTE(review): relies on the NLTK English stop-word list being all lowercase.
sr = stopwords.words('english')
stop_words = set(sr)
clean_tokens = [token for token in tokens if token.lower() not in stop_words]

# Count how often each remaining token occurs and print "token:count" lines.
freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))

# Plot the 20 most frequent tokens (plain counts, not cumulative).
freq.plot(20, cumulative=False)