Introductory examples
1.usa.gov data from bit.ly
'/Users/imonce/OneDrive/learning/dataAnalyze/pydata-book-master'
1 2 %cd ../pydata-book-master
/Users/imonce/OneDrive/learning/dataAnalyze/pydata-book-master
# Location of the bit.ly 1.usa.gov sample file, relative to the working directory.
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
import json

path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'

# Every line of the file is parsed as its own JSON document.
# The with-block closes the file handle as soon as parsing is finished.
with open(path) as f:
    records = [json.loads(line) for line in f]
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
'al': 'en-US,en;q=0.8',
'c': 'US',
'cy': 'Danvers',
'g': 'A6qOVH',
'gr': 'MA',
'h': 'wfLQtf',
'hc': 1331822918,
'hh': '1.usa.gov',
'l': 'orofrog',
'll': [42.576698, -70.954903],
'nk': 1,
'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
't': 1331923247,
'tz': 'America/New_York',
'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11'
Counting time zones in pure Python
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-8-992e1ec28c8d> in <module>()
1 # 如果查询不存在的key的话会报错
----> 2 records[0]['cc']
KeyError: 'cc'
# [rec['tz'] for rec in records] takes the value stored under the 'tz' key of
# each record and builds a list from those values.
# Run as-is this raises KeyError, because some records have no 'tz' key.
time_zones = [rec['tz'] for rec in records]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-9-abb6a4fa53e3> in <module>()
2 # [rec['tz'] for rec in records]:把rec中key为‘tz’的value取出来,作为item构建list
3 # 直接运行会报错,因为有的行里边是没有‘tz’这个key的
----> 4 time_zones = [rec['tz'] for rec in records]
<ipython-input-9-abb6a4fa53e3> in <listcomp>(.0)
2 # [rec['tz'] for rec in records]:把rec中key为‘tz’的value取出来,作为item构建list
3 # 直接运行会报错,因为有的行里边是没有‘tz’这个key的
----> 4 time_zones = [rec['tz'] for rec in records]
KeyError: 'tz'
# Only look up 'tz' on records that actually contain that key, so the
# comprehension cannot raise KeyError.
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
# Compare the number of parsed records with the number that have a 'tz' key.
print(len(records), len(time_zones))
3560 3440
def get_counts(sequence):
    """Count how many times each item occurs in *sequence*.

    Args:
        sequence: An iterable of hashable items (they are used as dict keys).

    Returns:
        A plain dict mapping each distinct item to its number of occurrences.
    """
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            # First time this item is seen: start its tally.
            counts[x] = 1
    return counts
from collections import defaultdict


def get_counts2(sequence):
    """Count how many times each item occurs in *sequence*.

    Args:
        sequence: An iterable of hashable items (they are used as dict keys).

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences; looking up an item that never occurred yields 0.
    """
    # int() returns 0, so every unseen key starts its tally at zero.
    counts = defaultdict(int)
    for x in sequence:
        counts[x] += 1
    return counts
# Tally how often each time zone string appears.
counts = get_counts(time_zones)
# Look up the tally for a single time zone.
counts['America/New_York']
1251
# Flip each (time zone, count) item into a (count, time zone) tuple.
[(count, tz) for tz, count in counts.items()]
[(1251, 'America/New_York'),
(191, 'America/Denver'),
(33, 'America/Sao_Paulo'),
(16, 'Europe/Warsaw'),
(521, ''),
(382, 'America/Los_Angeles'),
(10, 'Asia/Hong_Kong'),
(27, 'Europe/Rome'),
(2, 'Africa/Ceuta'),
(35, 'Europe/Madrid'),
(3, 'Asia/Kuala_Lumpur'),
(1, 'Asia/Nicosia'),
(74, 'Europe/London'),
(36, 'Pacific/Honolulu'),
(400, 'America/Chicago'),
(2, 'Europe/Malta'),
(8, 'Europe/Lisbon'),
(14, 'Europe/Paris'),
(5, 'Europe/Copenhagen'),
(1, 'America/Mazatlan'),
(3, 'Europe/Dublin'),
(4, 'Europe/Brussels'),
(12, 'America/Vancouver'),
(22, 'Europe/Amsterdam'),
(10, 'Europe/Prague'),
(14, 'Europe/Stockholm'),
(5, 'America/Anchorage'),
(6, 'Asia/Bangkok'),
(28, 'Europe/Berlin'),
(25, 'America/Rainy_River'),
(5, 'Europe/Budapest'),
(37, 'Asia/Tokyo'),
(6, 'Europe/Vienna'),
(20, 'America/Phoenix'),
(3, 'Asia/Jerusalem'),
(3, 'Asia/Karachi'),
(3, 'America/Bogota'),
(20, 'America/Indianapolis'),
(9, 'America/Montreal'),
(9, 'Asia/Calcutta'),
(1, 'Europe/Skopje'),
(4, 'Asia/Beirut'),
(6, 'Australia/NSW'),
(6, 'Chile/Continental'),
(4, 'America/Halifax'),
(6, 'America/Edmonton'),
(3, 'Europe/Bratislava'),
(2, 'America/Recife'),
(3, 'Africa/Cairo'),
(9, 'Asia/Istanbul'),
(1, 'Asia/Novosibirsk'),
(10, 'Europe/Moscow'),
(1, 'Europe/Sofia'),
(1, 'Europe/Ljubljana'),
(15, 'America/Mexico_City'),
(10, 'Europe/Helsinki'),
(4, 'Europe/Bucharest'),
(4, 'Europe/Zurich'),
(10, 'America/Puerto_Rico'),
(1, 'America/Monterrey'),
(6, 'Europe/Athens'),
(4, 'America/Winnipeg'),
(2, 'Europe/Riga'),
(1, 'America/Argentina/Buenos_Aires'),
(4, 'Asia/Dubai'),
(10, 'Europe/Oslo'),
(1, 'Asia/Yekaterinburg'),
(1, 'Asia/Manila'),
(1, 'America/Caracas'),
(1, 'Asia/Riyadh'),
(1, 'America/Montevideo'),
(1, 'America/Argentina/Mendoza'),
(5, 'Asia/Seoul'),
(1, 'Europe/Uzhgorod'),
(1, 'Australia/Queensland'),
(2, 'Europe/Belgrade'),
(1, 'America/Costa_Rica'),
(1, 'America/Lima'),
(1, 'Asia/Pontianak'),
(2, 'America/Chihuahua'),
(2, 'Europe/Vilnius'),
(3, 'America/Managua'),
(1, 'Africa/Lusaka'),
(2, 'America/Guayaquil'),
(3, 'Asia/Harbin'),
(2, 'Asia/Amman'),
(1, 'Africa/Johannesburg'),
(1, 'America/St_Kitts'),
(11, 'Pacific/Auckland'),
(1, 'America/Santo_Domingo'),
(1, 'America/Argentina/Cordoba'),
(1, 'Asia/Kuching'),
(1, 'Europe/Volgograd'),
(1, 'America/La_Paz'),
(1, 'Africa/Casablanca'),
(3, 'Asia/Jakarta'),
(1, 'America/Tegucigalpa')]
def top_counts(count_dict, n=10):
    """Return the *n* entries of *count_dict* with the largest counts.

    Args:
        count_dict: A mapping from key to count.
        n: Maximum number of entries to return. Values of 0 or less yield
            an empty list.

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        largest count comes last. Entries with equal counts are ordered by key.
    """
    # Guard needed because seq[-0:] is seq[0:], i.e. the whole list, and a
    # negative n would slice from the front instead of the back.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]
[(33, 'America/Sao_Paulo'),
(35, 'Europe/Madrid'),
(36, 'Pacific/Honolulu'),
(37, 'Asia/Tokyo'),
(74, 'Europe/London'),
(191, 'America/Denver'),
(382, 'America/Los_Angeles'),
(400, 'America/Chicago'),
(521, ''),
(1251, 'America/New_York')]
from collections import Counter

# Counter builds the per-time-zone tally directly from the list.
counts = Counter(time_zones)
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]