<a href="http://www.jianshu.com/p/54870e9541fc">总目录</a>
课程页面:https://www.udacity.com/course/intro-to-computer-science--cs101
授课教师:Dave Evans https://www.cs.virginia.edu/~evans/
如下内容包含课程笔记和自己的扩展折腾
Add to index
def add_to_index(index, keyword, url):
    """Record that ``url`` is associated with ``keyword``.

    ``index`` is a list of ``[keyword, [url, ...]]`` entries and is
    modified in place.  When ``keyword`` already has an entry, ``url`` is
    appended to that entry's url list; otherwise a new entry is added.

    Returns the same ``index`` list that was passed in.
    """
    for entry in index:
        if entry[0] == keyword:
            # Mutate the stored list; a bare ``entry[1] + [url]`` would
            # build a new list and throw it away.
            entry[1].append(url)
            # Return here so the keyword is not appended a second time.
            return index
    index.append([keyword, [url]])
    return index
# -*- coding: utf-8 -*-
# Define a procedure, add_to_index,
# that takes 3 inputs:
# - an index: [[<keyword>,[<url>,...]],...]
# - a keyword: String
# - a url: String
# If the keyword is already
# in the index, add the url
# to the list of urls associated
# with that keyword.
# If the keyword is not in the index,
# add an entry to the index: [keyword,[url]]
index = []


def add_to_index(index, keyword, url):
    """Add ``url`` to the url list stored for ``keyword``.

    ``index`` has the shape ``[[keyword, [url, ...]], ...]`` and is updated
    in place.  A keyword that is not present yet gets a new
    ``[keyword, [url]]`` entry appended.

    Returns the same ``index`` list that was passed in.
    """
    for entry in index:
        # Compare against the keyword slot only; a membership test on the
        # whole entry would also compare the keyword with the url list.
        if entry[0] == keyword:
            entry[1].append(url)
            return index
    index.append([keyword, [url]])
    return index
# Demo: build a small index and show it.  print(...) with one argument
# behaves the same on Python 2 and Python 3.
add_to_index(index, 'udacity', 'http://udacity.com')
add_to_index(index, 'computing', 'http://acm.org')
add_to_index(index, 'udacity', 'http://npr.org')
print(index)
#>>> [['udacity', ['http://udacity.com', 'http://npr.org']],
#>>> ['computing', ['http://acm.org']]]
Lookup
# -*- coding: utf-8 -*-
# Define a procedure, lookup,
# that takes two inputs:
# - an index
# - keyword
# The procedure should return a list
# of the urls associated
# with the keyword. If the keyword
# is not in the index, the procedure
# should return an empty list.
index = [
    ['udacity', ['http://udacity.com', 'http://npr.org']],
    ['computing', ['http://acm.org']],
]


def lookup(index, keyword):
    """Return the url list stored for ``keyword`` in ``index``.

    The first entry whose keyword equals ``keyword`` wins and its url list
    is returned as-is (not copied).  An empty list is returned when no
    entry matches.
    """
    matching_url_lists = (entry[1] for entry in index if entry[0] == keyword)
    return next(matching_url_lists, [])
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(lookup(index, 'udacity'))
#>>> ['http://udacity.com','http://npr.org']
Add page to index
# -*- coding: utf-8 -*-
# Define a procedure, add_page_to_index,
# that takes three inputs:
# - index
# - url (String)
# - content (String)
# It should update the index to include
# all of the word occurrences found in the
# page content by adding the url to the
# word's associated url list.
index = []


def add_to_index(index, keyword, url):
    """Append ``url`` to the url list of ``keyword`` inside ``index``.

    ``index`` is a list of ``[keyword, [url, ...]]`` records and is changed
    in place; nothing is returned.  A keyword seen for the first time gets
    a fresh ``[keyword, [url]]`` record.  Repeated urls are kept.
    """
    for record in index:
        if record[0] == keyword:
            record[1].append(url)
            break
    else:
        # Loop finished without a match: this keyword is new.
        index.append([keyword, [url]])
def add_page_to_index(index, url, content):
    """Index every whitespace-separated word of ``content`` under ``url``.

    Each word is passed to ``add_to_index`` together with ``url``, so
    ``index`` is updated in place.  Nothing is returned.
    """
    for word in content.split():
        add_to_index(index, word, url)
# print(...) with one argument behaves the same on Python 2 and Python 3.
add_page_to_index(index, 'fake.text', "This is a test")
print(index)
#>>> [['This', ['fake.text']], ['is', ['fake.text']], ['a', ['fake.text']],
#>>> ['test',['fake.text']]]
Better Splitting
# 1 Gold Star
# The built-in <string>.split() procedure works
# okay, but fails to find all the words on a page
# because it only uses whitespace to split the
# string. To do better, we should also use punctuation
# marks to split the page into words.
# Define a procedure, split_string, that takes two
# inputs: the string to split and a string containing
# all of the characters considered separators. The
# procedure should return a list of strings that break
# the source string up by the characters in the
# splitlist.
# 这个是我的算法,Udacity给的算法和我的思路些许不一样
def split_string(source, splitlist):
    """Split ``source`` on any character that appears in ``splitlist``.

    Runs of consecutive separators never produce empty strings.  Returns
    the list of pieces in the order they occur in ``source``.
    """
    pieces = []
    current = ""
    for symbol in source:
        if symbol in splitlist:
            # A separator ends the piece being collected, if there is one.
            if current:
                pieces.append(current)
            current = ""
        else:
            current = current + symbol
    # Flush the trailing piece when the text does not end on a separator.
    if current:
        pieces.append(current)
    return pieces
"""
def split_string(source, splitlist):
output = []
atsplit = True
for char in source:
print char
if char in splitlist:
atsplit = True
else:
if atsplit:
output.append(char)
atsplit = False
print output
else:
output[-1] = output[-1] + char
print output
return output
"""
# print(...) with one argument behaves the same on Python 2 and Python 3.
out = split_string("This is a test-of the,string separation-code!", " ,!-")
print(out)
#>>> ['This', 'is', 'a', 'test', 'of', 'the', 'string', 'separation', 'code']
out = split_string("After the flood ... all the colors came out.", " .")
print(out)
#>>> ['After', 'the', 'flood', 'all', 'the', 'colors', 'came', 'out']
out = split_string("First Name,Last Name,Street Address,City,State,Zip Code", ",")
print(out)
#>>>['First Name', 'Last Name', 'Street Address', 'City', 'State', 'Zip Code']
Improving the Index
# -*- coding: utf-8 -*-
# The current index includes a url in the list of urls
# for a keyword multiple times if the keyword appears
# on that page more than once.
# It might be better to only include the same url
# once in the url list for a keyword, even if it appears
# many times.
# Modify add_to_index so that a given url is only
# included once in the url list for a keyword,
# no matter how many times that keyword appears.
def add_to_index(index, keyword, url):
    """Associate ``url`` with ``keyword``, storing each url at most once.

    ``index`` is a list of ``[keyword, [url, ...]]`` records and is changed
    in place; nothing is returned.  An unknown keyword gets a new record.
    """
    for record in index:
        if record[0] != keyword:
            continue
        known_urls = record[1]
        if url not in known_urls:
            known_urls.append(url)
        return
    # No record for this keyword yet.
    index.append([keyword, [url]])
def get_page(url):
    """Return the canned HTML for one of the four known test urls.

    This is an offline stand-in for a real page fetch.  Any url that is not
    one of the hard-coded test pages yields the empty string.

    The body only compares strings, so no exception handler is needed; a
    bare ``except:`` here would only hide unrelated errors.
    """
    if url == "http://www.udacity.com/cs101x/index.html":
        return '''<html> <body> This is a test page for learning to crawl!
<p> It is a good idea to
<a href="http://www.udacity.com/cs101x/crawling.html">
learn to crawl</a> before you try to
<a href="http://www.udacity.com/cs101x/walking.html">walk</a> or
<a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body>
</html>'''
    elif url == "http://www.udacity.com/cs101x/crawling.html":
        return '''<html> <body> I have not learned to crawl yet, but I am
quite good at <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.
</body> </html>'''
    elif url == "http://www.udacity.com/cs101x/walking.html":
        return '''<html> <body> I cant get enough
<a href="http://www.udacity.com/cs101x/index.html">crawling</a></body></html>'''
    elif url == "http://www.udacity.com/cs101x/flying.html":
        return '''<html>
<body>The magic words are Squeamish Ossifrage!</body></html>'''
    return ""
def union(a, b):
    """Extend list ``a`` in place with the elements of ``b`` it lacks.

    Elements are appended in the order they appear in ``b``; an element
    repeated inside ``b`` is added only once.  Nothing is returned.
    """
    for candidate in b:
        if candidate in a:
            continue
        a.append(candidate)
def get_next_target(page):
    """Find the first ``<a href=`` link in ``page``.

    Returns ``(url, end_quote)`` where ``url`` is the text between the two
    double quotes that follow the tag and ``end_quote`` is the position of
    the closing quote.  Returns ``(None, 0)`` when the page has no link.
    """
    anchor_at = page.find('<a href=')
    if anchor_at != -1:
        opening = page.find('"', anchor_at)
        closing = page.find('"', opening + 1)
        return page[opening + 1:closing], closing
    return None, 0
def get_all_links(page):
    """Collect every link url in ``page``, in order of appearance.

    Calls ``get_next_target`` repeatedly, each time continuing from the
    closing quote of the previous link, and stops as soon as it reports no
    url (``None``) or an empty one.
    """
    found = []
    url, endpos = get_next_target(page)
    while url:
        found.append(url)
        page = page[endpos:]
        url, endpos = get_next_target(page)
    return found
def crawl_web(seed):
    """Crawl outward from ``seed`` and return the index that was built.

    Pages are taken from the end of the pending list.  Each page not seen
    before is fetched with ``get_page``, indexed with
    ``add_page_to_index``, and its links are merged into the pending list
    via ``union``.
    """
    index = []
    crawled = []
    tocrawl = [seed]
    while len(tocrawl) > 0:
        page = tocrawl.pop()
        if page in crawled:
            continue
        content = get_page(page)
        add_page_to_index(index, page, content)
        union(tocrawl, get_all_links(content))
        crawled.append(page)
    return index
def add_page_to_index(index, url, content):
    """Register ``url`` under every whitespace-separated word of ``content``.

    Delegates to ``add_to_index`` once per word; ``index`` is updated in
    place and nothing is returned.
    """
    for token in content.split():
        add_to_index(index, token, url)
def lookup(index, keyword):
    """Return the url list of the first entry matching ``keyword``.

    The stored list itself is returned (not a copy).  ``None`` is returned
    when ``keyword`` has no entry in ``index``.
    """
    hits = (entry[1] for entry in index if entry[0] == keyword)
    return next(hits, None)
# print(...) with one argument behaves the same on Python 2 and Python 3.
index = crawl_web("http://www.udacity.com/cs101x/index.html")
print(lookup(index, "is"))
#>>> ['http://www.udacity.com/cs101x/index.html']
Counting Clicks
# 2 Gold Stars
# One way search engines rank pages
# is to count the number of times a
# searcher clicks on a returned link.
# This indicates that the person doing
# the query thought this was a useful
# link for the query, so it should be
# higher in the rankings next time.
# (In Unit 6, we will look at a different
# way of ranking pages that does not depend
# on user clicks.)
# ___Modify the index___ such that for each url in a
# list for a keyword, there is also a number
# that counts the number of times a user
# clicks on that link for this keyword.
# The result of lookup(index,keyword) should
# now be a list of url entries, where each url
# entry is a list of a url and a number
# indicating the number of times that url
# was clicked for this query keyword.
# You should define ___a new procedure___ to simulate
# user clicks for a given link:
# ___record_user_click(index,word,url)___
# that modifies the entry in the index for
# the input word by increasing the count associated
# with the url by 1.
# You also will have to ____modify add_to_index____
# in order to correctly create the new data
# structure, and to prevent the repetition of
# entries as in homework 4-5.
def record_user_click(index, keyword, url):
    """Increase by one the click count of ``url`` under ``keyword``.

    ``index`` holds ``[keyword, [[url, count], ...]]`` entries and is
    changed in place.  Nothing happens when the keyword or the url is not
    present.  Nothing is returned.
    """
    for entry in index:
        if entry[0] != keyword:
            continue
        for url_record in entry[1]:
            if url_record[0] == url:
                url_record[1] = url_record[1] + 1
                return
def add_to_index(index, keyword, url):
    """Associate ``url`` with ``keyword``, starting its click count at 0.

    ``index`` holds ``[keyword, [[url, count], ...]]`` entries and is
    changed in place; nothing is returned.  A url already listed for the
    keyword is left untouched, so its click count is preserved.  Each
    keyword has exactly one entry.
    """
    for entry in index:
        if entry[0] == keyword:
            for item in entry[1]:
                if item[0] == url:
                    # Already recorded: keep the existing count.
                    return
            entry[1].append([url, 0])
            # Stop here; falling through would append a second entry for
            # the same keyword below.
            return
    # not found, add new keyword to index
    index.append([keyword, [[url, 0]]])
def get_page(url):
    """Return the canned HTML for one of the four known test urls.

    This is an offline stand-in for a real page fetch.  Any url that is not
    one of the hard-coded test pages yields the empty string.

    The body only compares strings, so no exception handler is needed; a
    bare ``except:`` here would only hide unrelated errors.
    """
    if url == "http://www.udacity.com/cs101x/index.html":
        return '''<html> <body> This is a test page for learning to crawl!
<p> It is a good idea to
<a href="http://www.udacity.com/cs101x/crawling.html">
learn to crawl</a> before you try to
<a href="http://www.udacity.com/cs101x/walking.html">walk</a> or
<a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body></html>'''
    elif url == "http://www.udacity.com/cs101x/crawling.html":
        return '''<html> <body> I have not learned to crawl yet, but I am
quite good at <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.
</body> </html>'''
    elif url == "http://www.udacity.com/cs101x/walking.html":
        return '''<html> <body> I cant get enough
<a href="http://www.udacity.com/cs101x/index.html">crawling</a>!</body></html>'''
    elif url == "http://www.udacity.com/cs101x/flying.html":
        return '<html><body>The magic words are Squeamish Ossifrage!</body></html>'
    return ""
def union(a, b):
    """Append to list ``a`` each element of ``b`` that ``a`` does not hold.

    ``a`` is modified in place, keeping the order in which new elements
    occur in ``b``.  Nothing is returned.
    """
    for element in b:
        already_present = element in a
        if not already_present:
            a.append(element)
def get_next_target(page):
    """Locate the first ``<a href=`` link in ``page``.

    Returns a pair ``(url, end_quote)``: the text between the two double
    quotes following the tag, and the position of the closing quote.  When
    no link tag is present the pair is ``(None, 0)``.
    """
    tag_position = page.find('<a href=')
    if tag_position != -1:
        first_quote = page.find('"', tag_position)
        second_quote = page.find('"', first_quote + 1)
        return page[first_quote + 1:second_quote], second_quote
    return None, 0
def get_all_links(page):
    """Return the urls of all links in ``page``, in document order.

    Uses ``get_next_target`` to pull one link at a time, trimming ``page``
    up to the previous closing quote, until it yields no url (``None``) or
    an empty url.
    """
    collected = []
    url, endpos = get_next_target(page)
    while url:
        collected.append(url)
        page = page[endpos:]
        url, endpos = get_next_target(page)
    return collected
def crawl_web(seed):
    """Build and return an index by crawling every page reachable from ``seed``.

    The most recently discovered page is visited next.  Pages already
    crawled are skipped; every other page is fetched with ``get_page``,
    indexed with ``add_page_to_index`` and its links are queued via
    ``union``.
    """
    index = []
    crawled = []
    tocrawl = [seed]
    while len(tocrawl) > 0:
        page = tocrawl.pop()
        if page in crawled:
            continue
        content = get_page(page)
        add_page_to_index(index, page, content)
        union(tocrawl, get_all_links(content))
        crawled.append(page)
    return index
def add_page_to_index(index, url, content):
    """Add ``url`` to ``index`` for each whitespace-separated word in ``content``.

    Every word is handed to ``add_to_index``; ``index`` is modified in
    place and nothing is returned.
    """
    for token in content.split():
        add_to_index(index, token, url)
def lookup(index, keyword):
    """Return what is stored for ``keyword``: its list of url entries.

    The list kept in the first matching entry is returned directly (not a
    copy), so changes made to the index afterwards remain visible through
    it.  ``None`` is returned when the keyword is absent.
    """
    candidates = (entry[1] for entry in index if entry[0] == keyword)
    return next(candidates, None)
#Here is an example showing a sequence of interactions:
# print(...) with one argument behaves the same on Python 2 and Python 3.
index = crawl_web('http://www.udacity.com/cs101x/index.html')
print(lookup(index, 'good'))
#>>> [['http://www.udacity.com/cs101x/index.html', 0],
#>>> ['http://www.udacity.com/cs101x/crawling.html', 0]]
record_user_click(index, 'good', 'http://www.udacity.com/cs101x/crawling.html')
print(lookup(index, 'good'))
#>>> [['http://www.udacity.com/cs101x/index.html', 0],
#>>> ['http://www.udacity.com/cs101x/crawling.html', 1]]
Word Count
# -*- coding: utf-8 -*-
# Write a procedure, count_words, which takes as input a string
# and returns the number of words in the string. You may consider words
# as strings of characters separated by spaces.
def count_words(strings):
    """Count the words in ``strings``.

    A word is a maximal run of characters other than the space character;
    only ``" "`` separates words (tabs and newlines count as word
    characters).  Leading, trailing and repeated spaces are ignored.
    """
    # Splitting on a single space yields an empty piece for every extra
    # space, so only the non-empty pieces are words.
    return sum(1 for piece in strings.split(" ") if piece)
passage = ("The number of orderings of the 52 cards in a deck of cards "
           "is so great that if every one of the almost 7 billion people alive "
           "today dealt one ordering of the cards per second, it would take "
           "2.5 * 10**40 times the age of the universe to order the cards in every "
           "possible way.")
#print passage
#>>>The number of orderings of the 52 cards in a deck of cards is so great that if every one of the almost 7 billion people alive today dealt one ordering of the cards per second, it would take 2.5 * 10**40 times the age of the universe to order the cards in every possible way.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(count_words(passage))
#>>>56
speed_fraction
有关这个练习当中我出错的地方,写了个笔记。
下面是正确的代码:
# -*- coding: utf-8 -*-
# Write a procedure, speed_fraction, which takes as its inputs the result of
# a traceroute (in ms) and distance (in km) between two points. It should
# return the speed the data travels as a decimal fraction of the speed of
# light.
speed_of_light = 300000.  # km per second


def speed_fraction(traceroute, distance):
    """Return the data's travel speed as a fraction of the speed of light.

    ``traceroute`` is a time in milliseconds and ``distance`` is in
    kilometres.  The time is halved and converted to seconds before the
    distance is divided by it.
    """
    halved_time_seconds = traceroute * 1.0 / 2 / 1000.
    km_per_second = distance / halved_time_seconds
    return km_per_second / speed_of_light
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(speed_fraction(50, 5000))
#>>> 0.666666666667
print(speed_fraction(50, 10000))
#>>> 1.33333333333 # Any thoughts about this answer, or these inputs?
print(speed_fraction(75, 4500))
#>>> 0.4
Converting Seconds
# -*- coding: utf-8 -*-
# Write a procedure, convert_seconds, which takes as input a non-negative
# number of seconds and returns a string of the form
# '<integer> hours, <integer> minutes, <number> seconds' but
# where if <integer> is 1 for the number of hours or minutes,
# then it should be hour/minute. Further, <number> may be an integer
# or decimal, and if it is 1, then it should be followed by second.
# You might need to use int() to turn a decimal into an integer depending
# on how you code this. int(3.0) gives 3
#
# Note that English uses the plural when talking about 0 items, so
# it should be "0 minutes".
#
#算法: 余数
# 1 hour = 3600 seconds
# 1 minute = 60 seconds
def _count_with_unit(count_text, unit):
    """Return ``'<count> <unit>'``, pluralising unless the count is exactly '1'."""
    if count_text == '1':
        return '%s %s' % (count_text, unit)
    return '%s %ss' % (count_text, unit)


def convert_seconds(time):
    """Format a non-negative number of seconds as hours, minutes, seconds.

    Returns a string such as ``'2 hours, 1 minute, 1.7 seconds'``.  Hours
    and minutes are whole numbers; the seconds part keeps any fraction.
    A unit is singular only when its count is exactly 1 (``0`` is plural).
    """
    hours = int(time // 3600)
    time -= hours * 3600
    minutes = int(time // 60)
    seconds = time - minutes * 60
    # 12 significant digits hide binary floating-point noise, so 7261.7
    # yields '1.7' rather than '1.6999999999998181'; whole values print
    # without a trailing '.0'.
    seconds_text = '%.12g' % seconds
    return ', '.join([
        _count_with_unit(str(hours), 'hour'),
        _count_with_unit(str(minutes), 'minute'),
        _count_with_unit(seconds_text, 'second'),
    ])
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(convert_seconds(3661))
#>>> 1 hour, 1 minute, 1 second
print(convert_seconds(7325))
#>>> 2 hours, 2 minutes, 5 seconds
print(convert_seconds(7261.7))
#>>> 2 hours, 1 minute, 1.7 seconds
Download Calculator
# -*- coding: utf-8 -*-
# Write a procedure download_time which takes as inputs a file size, the
# units that file size is given in, bandwidth and the units for
# bandwidth (excluding per second) and returns the time taken to download
# the file.
# Your answer should be a string in the form
# "<number> hours, <number> minutes, <number> seconds"
# Some information you might find useful is the number of bits
# in kilobits (kb), kilobytes (kB), megabits (Mb), megabytes (MB),
# gigabits (Gb), gigabytes (GB) and terabits (Tb), terabytes (TB).
#print 2 ** 10 # one kilobit, kb
#print 2 ** 10 * 8 # one kilobyte, kB
#print 2 ** 20 # one megabit, Mb
#print 2 ** 20 * 8 # one megabyte, MB
#print 2 ** 30 # one gigabit, Gb
#print 2 ** 30 * 8 # one gigabyte, GB
#print 2 ** 40 # one terabit, Tb
#print 2 ** 40 * 8 # one terabyte, TB
# Often bandwidth is given in megabits (Mb) per second whereas file size
# is given in megabytes (MB).
def download_time(file_num, file_unit, bandwidth_num, bandwidth_unit):
    """Return how long a download takes, formatted by ``convert_seconds``.

    ``file_num``/``file_unit`` give the file size and
    ``bandwidth_num``/``bandwidth_unit`` give the amount transferred per
    second.  Units are case-sensitive: a lowercase ``b`` means bits, an
    uppercase ``B`` means bytes (``kb``, ``kB``, ``Mb``, ``MB``, ``Gb``,
    ``GB``, ``Tb``, ``TB``).

    Raises ``ValueError`` when either unit is not one of those names.
    """
    bits_per_unit = {
        "kb": 2 ** 10,
        "kB": 2 ** 10 * 8,
        "Mb": 2 ** 20,
        "MB": 2 ** 20 * 8,
        "Gb": 2 ** 30,
        "GB": 2 ** 30 * 8,
        "Tb": 2 ** 40,
        "TB": 2 ** 40 * 8,
    }
    # Reject unknown units up front instead of failing later on an
    # unassigned local variable.
    for unit in (file_unit, bandwidth_unit):
        if unit not in bits_per_unit:
            raise ValueError("unknown unit: %r" % (unit,))
    total_bit_file = bits_per_unit[file_unit] * file_num
    total_bit_bandwidth = bits_per_unit[bandwidth_unit] * bandwidth_num
    # The 1.0 factor forces true division on Python 2 as well.
    dl_seconds = total_bit_file * 1.0 / total_bit_bandwidth
    return convert_seconds(dl_seconds)
def _count_with_unit(count_text, unit):
    """Return ``'<count> <unit>'``, pluralising unless the count is exactly '1'."""
    if count_text == '1':
        return '%s %s' % (count_text, unit)
    return '%s %ss' % (count_text, unit)


def convert_seconds(time):
    """Format a non-negative number of seconds as hours, minutes, seconds.

    Returns a string such as ``'0 hours, 39 minutes, 37.1428571429
    seconds'``.  Hours and minutes are whole numbers; the seconds part
    keeps any fraction.  A unit is singular only when its count is exactly
    1 (``0`` is plural).
    """
    hours = int(time // 3600)
    time -= hours * 3600
    minutes = int(time // 60)
    seconds = time - minutes * 60
    # 12 significant digits hide binary floating-point noise and print
    # whole float values such as 8.0 as '8'.
    seconds_text = '%.12g' % seconds
    return ', '.join([
        _count_with_unit(str(hours), 'hour'),
        _count_with_unit(str(minutes), 'minute'),
        _count_with_unit(seconds_text, 'second'),
    ])
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(download_time(1024, 'kB', 1, 'MB'))
#>>> 0 hours, 0 minutes, 1 second
print(download_time(1024, 'kB', 1, 'Mb'))
#>>> 0 hours, 0 minutes, 8 seconds # 8.0 seconds is also acceptable
print(download_time(13, 'GB', 5.6, 'MB'))
#>>> 0 hours, 39 minutes, 37.1428571429 seconds
print(download_time(13, 'GB', 5.6, 'Mb'))
#>>> 5 hours, 16 minutes, 57.1428571429 seconds
print(download_time(10, 'MB', 2, 'kB'))
#>>> 1 hour, 25 minutes, 20 seconds # 20.0 seconds is also acceptable
print(download_time(10, 'MB', 2, 'kb'))
#>>> 11 hours, 22 minutes, 40 seconds # 40.0 seconds is also acceptable