2017年12月

1、PV的案例代码展示

/**
 * Counts the total number of page views (PV) in a carrier access log with Spark.
 *
 * Every line of the input file is treated as one page view. The total is
 * computed twice, in two different ways, purely for demonstration.
 *
 * Usage: `PV [inputPath]` - when no path is given, the default sample file is read.
 */
object PV {

  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.rdd.RDD

  /** Log file that is read when no path is passed on the command line. */
  private val DefaultInputPath = "E:\\data\\access.txt"

  /**
   * Program entry point.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse
   */
  def main(args: Array[String]): Unit = {
    val inputPath: String = args.headOption.getOrElse(DefaultInputPath)

    // 1. Build the configuration. The master falls back to local[2] only when
    //    none has been configured already, so an externally supplied
    //    spark.master setting is no longer overwritten.
    val sparkConf: SparkConf = new SparkConf()
      .setAppName("PV")
      .setIfMissing("spark.master", "local[2]")

    // 2. Create the SparkContext.
    val sc: SparkContext = new SparkContext(sparkConf)

    try {
      // Reduce log noise.
      sc.setLogLevel("WARN")

      // 3. Read the log data, one record per line.
      val dataRDD: RDD[String] = sc.textFile(inputPath)

      // 4. Count the page views.
      // Approach 1: map every line to ("PV", 1) and sum the ones per key.
      val result: RDD[(String, Int)] = dataRDD.map(_ => ("PV", 1)).reduceByKey(_ + _)

      // collect() first so the output is printed by the driver; a plain
      // RDD.foreach(println) prints wherever the tasks happen to run.
      result.collect().foreach(println)

      // Approach 2: simply count the lines.
      println(s"方法二 PV总量: ${dataRDD.count()}")
    } finally {
      // Always release the context, even when one of the jobs above fails.
      sc.stop()
    }
  }

}

2、UV的案例代码展示

/**
 * Counts the number of unique visitors (UV) in a carrier access log with Spark.
 *
 * The first space-separated field of every line is taken as the visitor key
 * (the original notes describe it as the client IP address) and the number
 * of distinct keys is printed.
 *
 * Usage: `UV [inputPath]` - when no path is given, the default sample file is read.
 *
 * An explicit `main` method is used instead of `extends App`: Spark's
 * documentation advises against `scala.App` for applications because of its
 * delayed initialisation.
 */
object UV {

  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.rdd.RDD

  /** Log file that is read when no path is passed on the command line. */
  private val DefaultInputPath = "E:\\data\\access.txt"

  /**
   * Program entry point.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse
   */
  def main(args: Array[String]): Unit = {
    val inputPath: String = args.headOption.getOrElse(DefaultInputPath)

    // 1. Build the configuration; local[2] is only a fallback when no master
    //    has been configured already.
    val sparkConf: SparkConf = new SparkConf()
      .setAppName("UV")
      .setIfMissing("spark.master", "local[2]")

    // 2. Create the SparkContext.
    val sc: SparkContext = new SparkContext(sparkConf)

    try {
      // Reduce log noise.
      sc.setLogLevel("WARN")

      // 3. Read the log data, one record per line.
      val dataRDD: RDD[String] = sc.textFile(inputPath)

      // 4. Take the first field of every line.
      //    headOption guards against lines made only of spaces (split yields
      //    an empty array there, so indexing with (0) would throw), and the
      //    nonEmpty filter keeps blank lines from counting as a visitor.
      val ips: RDD[String] = dataRDD
        .flatMap(_.split(" ").headOption)
        .filter(_.nonEmpty)

      // 5. De-duplicate and count.
      val ipNum: Long = ips.distinct().count()

      // 6. Print the result.
      println(s"总的UV量: $ipNum")
    } finally {
      // Always release the context, even when the job above fails.
      sc.stop()
    }
  }

}

3、TopN的案例代码展示

/**
 * Prints the five most frequent values of the 11th space-separated field
 * (index 10) of an access log, together with their occurrence counts.
 * The original notes describe that field as the URL.
 *
 * Usage: `TopN [inputPath]` - when no path is given, the default sample file is read.
 *
 * An explicit `main` method is used instead of `extends App`: Spark's
 * documentation advises against `scala.App` for applications because of its
 * delayed initialisation.
 */
object TopN {

  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.rdd.RDD

  /** Log file that is read when no path is passed on the command line. */
  private val DefaultInputPath = "E:\\data\\access.txt"

  /** Zero-based index of the field that is ranked. */
  private val UrlFieldIndex = 10

  /** How many of the top entries are printed. */
  private val TopCount = 5

  /**
   * Program entry point.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse
   */
  def main(args: Array[String]): Unit = {
    val inputPath: String = args.headOption.getOrElse(DefaultInputPath)

    // 1. Build the configuration; local[2] is only a fallback when no master
    //    has been configured already.
    val sparkConf: SparkConf = new SparkConf()
      .setAppName("TopN")
      .setIfMissing("spark.master", "local[2]")

    // 2. Create the SparkContext.
    val sc: SparkContext = new SparkContext(sparkConf)

    try {
      // Reduce log noise.
      sc.setLogLevel("WARN")

      // 3. Read the log data, one record per line.
      val dataRDD: RDD[String] = sc.textFile(inputPath)

      // 4. Split every line once, drop records that are too short to contain
      //    the ranked field, and pair each value with a count of 1.
      val urlAndOne: RDD[(String, Int)] = dataRDD
        .map(_.split(" "))
        .filter(_.length > UrlFieldIndex)
        .map(fields => (fields(UrlFieldIndex), 1))

      // Drop placeholder values: anything of three characters or fewer is
      // discarded. NOTE(review): presumably this targets the quoted "-"
      // placeholder of the log format - confirm against the data.
      val urls: RDD[(String, Int)] = urlAndOne.filter { case (url, _) => url.length > 3 }

      // 5. Sum the occurrences of each value.
      val result: RDD[(String, Int)] = urls.reduceByKey(_ + _)

      // 6. Order by count, highest first.
      val resultSort: RDD[(String, Int)] = result.sortBy(_._2, ascending = false)

      // 7. Keep the leading entries.
      val finalResultSort: Array[(String, Int)] = resultSort.take(TopCount)

      // 8. Print them (this is a local array, so printing happens on the driver).
      finalResultSort.foreach(println)
    } finally {
      // Always release the context, even when the job above fails.
      sc.stop()
    }
  }
}

pom.xml 如下所示

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Spark example programs (PV, UV, TopN). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>cn.itcast</groupId>
    <artifactId>Spark</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- Source and target level must agree; the target used to read "18",
             which is not the Java 8 level that the source setting asks for. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <scala.compat.version>2.11</scala.compat.version>
        <hadoop.version>2.7.4</hadoop.version>
        <spark.version>2.0.2</spark.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- The _2.11 suffix has to match scala.compat.version above. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
    </dependencies>
    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <!-- Compiles the Scala main and test sources. -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <!-- Runs test classes whose names end in Test or Suite. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.18.1</version>
                <configuration>
                    <useFile>false</useFile>
                    <disableXmlReport>true</disableXmlReport>
                    <includes>
                        <include>**/*Test.*</include>
                        <include>**/*Suite.*</include>
                    </includes>
                </configuration>
            </plugin>

            <!-- Builds a single jar that also contains the dependencies. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <!-- Pinned so the build is reproducible and Maven does not warn
                     about a missing plugin version. -->
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <!-- Strip signature files of the bundled jars; they
                                     do not match the contents of the merged jar. -->
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <!-- Left empty on purpose: fill in the fully qualified
                                         name of the entry-point object (for example PV, UV
                                         or TopN) to get a Main-Class manifest entry. -->
                                    <mainClass></mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

资源如下:

access.txt