David Tsukiyama Home

from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import seaborn  # not referenced below; kept from the original (presumably imported for its plot styling -- confirm)

# Bar chart of the 30 profile names that appear most often in `data`.
# `data` (an iterable of review records keyed by 'review/profileName') and
# `removeNonAscii` are defined outside this snippet.
users = Counter(b['review/profileName'] for b in data)

# Rank once and derive both the bar heights and the tick labels from it.
top_users = users.most_common(30)
values = [count for item, count in top_users]
labels = [removeNonAscii(item) for item, count in top_users]

indexes = np.arange(len(labels))
width = 1
plt.bar(indexes, values, width, color='crimson', alpha=0.6)
plt.xticks(indexes + width * 0.5, labels, rotation=90)
plt.title("Top 30 Users", fontsize=16)

Data Mining: Unstructured Data

The use of APIs and scraping -- Python


from lxml import html
import requests
# Download the blog's front page and parse the HTML into an element tree.
page = requests.get('http://infoproc.blogspot.jp')
tree = html.fromstring(page.text)

# Collect the text of every <a dir="ltr"> link as a plain list
# (the pasted output below shows these are the blog's category labels).
categories = list(tree.xpath('//a[@dir="ltr"]/text()'))

['physics',
 'finance',
 'globalization',
 'genetics',
 'brainpower',
 'photos',
 'technology',
 'economics',
 'science',
 'american society',
 'China',
 'psychometrics',
 'credit crisis',
 'travel',
 'innovation',
 'higher education',
 'human capital',

Example: The New York Times API


# NYT Community API request for the recent comments on a single article.
# The URL is assembled from adjacent string literals inside parentheses so it
# stays one unbroken string: splitting a quoted literal across physical lines
# is a syntax error and would put whitespace inside the URL.
# '###' is a placeholder for the API key.
bucket = (
    'http://api.nytimes.com/svc/community/v3/user-content/recent.json?'
    'api-key=###'
    '&url=http://www.nytimes.com/2015/04/12/opinion/sunday/david-brooks-the-moral-bucket-list.html'
)

# NOTE: urllib.urlopen is the Python 2 spelling; `urllib` and `json` must be
# imported before this snippet runs.
response = urllib.urlopen(bucket).read()
response_dictionary = json.loads(response)

# (output key, key in the raw comment record) pairs used by parse().
_COMMENT_FIELDS = (
    ('comment', 'commentBody'),
    ('date', 'createDate'),
    ('comment_type', 'commentType'),
    ('editorsSelection', 'editorsSelection'),
    ('email', 'email'),
    ('recommend', 'recommendationCount'),
    ('replies', 'replies'),
    ('name', 'userDisplayName'),
    ('location', 'userLocation'),
    ('login', 'login'),
)


def parse(mail):
    """Reduce raw comment records to dicts with short, fixed keys.

    Args:
        mail: iterable of mappings, each containing every source key listed
            in ``_COMMENT_FIELDS`` (``commentBody``, ``createDate``, ...).
            Any other keys in a record are ignored.

    Returns:
        A new list with one dict per input record, in input order, holding
        exactly the ten renamed fields. Values are copied by reference.

    Raises:
        KeyError: if a record lacks one of the expected source keys.
    """
    brooks = []
    for record in mail:
        brooks.append({out_key: record[src_key]
                       for out_key, src_key in _COMMENT_FIELDS})
    return brooks

# Take the raw comment records from the decoded API payload, flatten them
# with parse(), then show the first flattened record.
comments = parse(response_dictionary['results']['comments'])
comments[0]

{'comment': u'The lament of the hollow man who sees but does not understand.',
 'comment_type': u'comment',
 'date': u'1428853503',
 'editorsSelection': 0,
 'email': u'wilkinson.eileen@gmail.com',
 'location': u'Maine',
 'login': None,
 'name': u'Eileen Wilkinson',
 'recommend': 4,
 'replies': []}

Data, simple

David Tsukiyama

Who am I?

Examples:

Traditional econometrics

Data Mining

Data Visualization

Machine Learning

Interactive Visualization

Data Mining: Unstructured Data