I. Approach 1: In-Code Instrumentation (Java + Spring Boot + Remote Metrics Push)
1. Dependencies (pom.xml, Commented)
```xml
<dependencies>
    <!-- Spring Boot Web starter: HTTP endpoints for the login/operation requests -->
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
        <version>3.2.0</version> <!-- stable release in the Spring Boot 3.x line -->
    </dependency>
    <!-- Prometheus Spring Boot integration: exposes the /prometheus metrics endpoint without writing a controller -->
    <dependency>
        <groupId>io.prometheus</groupId>
        <artifactId>simpleclient_spring_boot</artifactId>
        <version>0.16.0</version>
    </dependency>
    <!-- Prometheus Hotspot collector: JVM metrics (memory, GC, threads) to complement the business metrics -->
    <dependency>
        <groupId>io.prometheus</groupId>
        <artifactId>simpleclient_hotspot</artifactId>
        <version>0.16.0</version>
    </dependency>
</dependencies>
```
2. Business Code (Full Instrumentation, Commented)
```java
import io.prometheus.client.Counter;
import io.prometheus.client.spring.boot.EnablePrometheusEndpoint;           // exposes the Prometheus metrics endpoint
import io.prometheus.client.spring.boot.EnableSpringBootMetricsCollector;   // collects built-in Spring Boot metrics (Tomcat, JVM)
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RestController;

import java.util.Map;

@SpringBootApplication              // marks the Spring Boot application: component scan, auto-configuration, property binding
@EnablePrometheusEndpoint           // exposes the /prometheus HTTP endpoint for Prometheus to scrape
@EnableSpringBootMetricsCollector   // integrates built-in metrics (http.server.requests, jvm.memory.used, ...)
@RestController                     // controller: methods map directly to HTTP endpoints
public class BusinessApplication {

    // Login counter: Counter type (monotonically increasing), counts cumulative login attempts.
    // name:       metric name (Prometheus convention: lowercase + underscores, no special characters)
    // help:       description shown in dashboards
    // labelNames: label dimensions for grouping (login result, client type)
    private static final Counter LOGIN_COUNTER = Counter.build()
            .name("user_login_total")
            .help("Total number of user login attempts (success/failed, pc/mobile)")
            .labelNames("result", "client_type")
            .register(); // register with the default registry so the scrape endpoint can find the metric

    // Operation counter: counts business operations by type (query/order/refund)
    private static final Counter OPERATION_COUNTER = Counter.build()
            .name("user_operation_total")
            .help("Total number of user business operations (query/order/refund)")
            .labelNames("operation_type")
            .register();

    public static void main(String[] args) {
        SpringApplication.run(BusinessApplication.class, args); // start the app: load the context, start embedded Tomcat
    }

    // Login endpoint: POST /login with a JSON body
    @PostMapping("/login")
    public ResponseEntity<?> login(@RequestBody Map<String, String> request) {
        // Extract parameters: username and password (required), client type (optional, defaults to "unknown")
        String username = request.get("username");
        String password = request.get("password");
        String clientType = request.getOrDefault("client_type", "unknown");

        // Simulated credential check: fixed account (a real project would call a database / auth service)
        if ("admin".equals(username) && "123456".equals(password)) {
            // Success: increment the counter with result=success, client_type=<client type>
            LOGIN_COUNTER.labels("success", clientType).inc(); // inc(): counter +1
            return ResponseEntity.ok(Map.of("code", 200, "msg", "login succeeded"));
        } else {
            // Failure: increment the counter with result=failed, client_type=<client type>
            LOGIN_COUNTER.labels("failed", clientType).inc();
            return ResponseEntity.status(401).body(Map.of("code", 401, "msg", "invalid username or password"));
        }
    }

    // Business operation endpoint: POST /operation with a JSON body
    @PostMapping("/operation")
    public ResponseEntity<?> operation(@RequestBody Map<String, String> request) {
        // Operation type (optional, defaults to "unknown")
        String operationType = request.getOrDefault("operation_type", "unknown");
        // Count operations grouped by type
        OPERATION_COUNTER.labels(operationType).inc();
        return ResponseEntity.ok(Map.of("code", 200, "msg", "operation succeeded"));
    }
}
```
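A quick local smoke test can confirm the counters before building the image. A minimal sketch, assuming the app runs on localhost:8080 and the metrics endpoint is served at /prometheus by the simpleclient_spring_boot integration:

```bash
# Simulate one successful and one failed login
curl -s -X POST localhost:8080/login -H 'Content-Type: application/json' \
  -d '{"username":"admin","password":"123456","client_type":"pc"}'
curl -s -X POST localhost:8080/login -H 'Content-Type: application/json' \
  -d '{"username":"admin","password":"wrong","client_type":"mobile"}'

# Both label combinations should now appear on the metrics endpoint
curl -s localhost:8080/prometheus | grep user_login_total
```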
3. Dockerfile (Complete Build Steps, Commented)
```dockerfile
# Base image: OpenJDK 17 JDK slim (Debian-slim based, roughly 200 MB, suitable for production)
FROM openjdk:17-jdk-slim
# Working directory: subsequent instructions run here, keeping the image layout tidy
WORKDIR /app
# Copy the Maven-built jar into the image: target/ is Maven's default output directory, app.jar is our chosen name
COPY target/business-app-0.0.1-SNAPSHOT.jar app.jar
# Document the service port: must match Spring Boot's server.port (default 8080); EXPOSE is informational only
EXPOSE 8080
# Container entrypoint: run the jar; /app/app.jar is the absolute path inside the container
ENTRYPOINT ["java", "-jar", "/app/app.jar"]
```
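Building and pushing the image is not part of the Dockerfile itself. A minimal sketch, assuming Maven on the PATH and registry.example.com as a placeholder registry:

```bash
mvn clean package -DskipTests                                  # produces target/business-app-0.0.1-SNAPSHOT.jar
docker build -t registry.example.com/business-app:latest .     # build the image from the Dockerfile above
docker push registry.example.com/business-app:latest           # push it somewhere the cluster can pull from
```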
4. K8s Pod Configuration (Business Pod + Prometheus Scraping + Remote Write, Commented)
```yaml
# business-pod.yaml: business Pod definition (runs the Spring Boot application)
apiVersion: v1                      # core API group, version v1
kind: Pod                           # Pod: smallest deployable unit, one or more containers
metadata:
  name: business-pod                # unique name, used by kubectl (e.g. kubectl logs business-pod)
  labels:
    app: business-app               # label used for Service selection, Prometheus service discovery, Deployment management
spec:
  containers:
    - name: business-container      # container name, unique inside the Pod
      image: your-business-image:latest   # replace with your actual registry/image (Harbor, Docker Hub, ...)
      ports:
        - containerPort: 8080       # matches the application port, reachable inside the cluster
      resources:                    # resource limits: keep the container from starving other workloads
        limits:                     # hard cap: at most 0.5 CPU core and 512Mi memory
          cpu: 500m
          memory: 512Mi
        requests:                   # scheduling request: the scheduler places the Pod on a node with this much free capacity
          cpu: 200m
          memory: 256Mi
---
# prometheus-config.yaml: Prometheus configuration (scrapes the business metrics and forwards them via remote_write)
apiVersion: v1
kind: ConfigMap                     # ConfigMap: keeps configuration out of the image
metadata:
  name: prometheus-config           # mounted by the Prometheus Pod
  namespace: monitoring             # Prometheus usually lives in the monitoring namespace, isolated from workloads
data:
  prometheus.yml: |
    global:
      scrape_interval: 10s          # default scrape interval for all jobs
      evaluation_interval: 10s      # how often alerting/recording rules are evaluated

    # remote_write: push scraped metrics to a remote system (Grafana Cloud, a central Prometheus, ...)
    remote_write:
      - url: "https://remote-prom.example.com/api/v1/write"   # replace with the real Remote Write endpoint
        # Basic auth credentials required by the remote service (e.g. a Grafana Cloud API key)
        basic_auth:
          username: "your-remote-username"    # remote username (e.g. Grafana Cloud instance ID)
          password: "your-remote-api-key"     # remote password/API key (better mounted from a Secret)
        # Queue tuning: improves push throughput and buffers samples during network hiccups
        queue_config:
          max_samples_per_send: 1000          # samples per request: fewer, larger requests
          max_shards: 200                     # maximum parallel shards pushing data
          capacity: 50000                     # per-shard buffer, absorbs short outages
          batch_send_deadline: 5s             # flush a batch at least every 5 seconds
        # Note: remote_write always compresses with Snappy and retries failed sends with
        # exponential backoff; there are no separate compression/retry switches to configure here.

    # Scrape job: collect metrics from the business Pods
    scrape_configs:
      - job_name: 'business-app-metrics'      # job name, distinguishes this target set
        metrics_path: /prometheus             # metrics path exposed by simpleclient_spring_boot
        scrape_interval: 5s                   # overrides the global interval for fresher business metrics
        scrape_timeout: 3s                    # give up after 3 seconds to avoid blocking the scrape loop
        kubernetes_sd_configs:                # Kubernetes service discovery: no static target list needed
          - role: pod                         # discover Pods
            namespaces:
              names: ["default"]              # only the namespace where the business Pod runs
        # Relabeling: filter targets and adjust labels so only the business Pods are scraped
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]  # the Pod's app label
            regex: business-app                               # keep only Pods with app=business-app
            action: keep
          - source_labels: [__address__]                      # Pod address (PodIP or PodIP:port)
            regex: '([^:]+)(?::\d+)?'                         # strip any existing port
            target_label: __address__
            replacement: '${1}:8080'                          # scrape PodIP:8080 (the application port)
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod_name                            # expose the Pod name as a label for dashboards
```
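Once Prometheus is running with this ConfigMap, the scrape job and the instrumented counters can be checked from the Prometheus HTTP API. A sketch, assuming the Prometheus Service in the monitoring namespace is named prometheus and jq is installed locally:

```bash
kubectl -n monitoring port-forward svc/prometheus 9090:9090 &

# The business-app-metrics job should list the Pod as a healthy target
curl -s http://localhost:9090/api/v1/targets \
  | jq '.data.activeTargets[] | select(.labels.job=="business-app-metrics") | {health, scrapeUrl}'

# Query the login counter, grouped by result label
curl -sG http://localhost:9090/api/v1/query \
  --data-urlencode 'query=sum(rate(user_login_total[5m])) by (result)'
```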
II. Approach 2: Log Parsing (Telegraf Sidecar + Remote Log/Metric Push)
1. Business Log Configuration (logback.xml, Structured JSON Output, Commented)
```xml
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <!-- Appender 1: JSON file output (writes structured JSON log lines to a file) -->
    <appender name="JSON_FILE" class="ch.qos.logback.core.FileAppender">
        <file>/var/log/business/app.log</file>  <!-- path inside the container, must match the K8s volume mount -->
        <immediateFlush>true</immediateFlush>   <!-- flush every event so log lines are written out immediately -->
        <append>true</append>                   <!-- append to the file instead of overwriting history -->
        <encoder class="net.logstash.logback.encoder.LogstashEncoder"> <!-- JSON encoder -->
            <!-- MDC fields: values stored in the thread context are written into the JSON log line -->
            <includeMdcKeyName>event</includeMdcKeyName>            <!-- event type: login/operation -->
            <includeMdcKeyName>result</includeMdcKeyName>           <!-- result: success/failed -->
            <includeMdcKeyName>client_type</includeMdcKeyName>      <!-- client type: pc/mobile/unknown -->
            <includeMdcKeyName>operation_type</includeMdcKeyName>   <!-- operation type: query/order/refund -->
            <!-- Custom JSON field names: avoids clashes with Elasticsearch built-in fields -->
            <fieldNames>
                <timestamp>time</timestamp>         <!-- rename @timestamp to time -->
                <message>msg</message>              <!-- rename message to msg -->
                <logger>logger_name</logger>        <!-- rename logger to logger_name -->
                <level>log_level</level>            <!-- rename level to log_level -->
            </fieldNames>
            <!-- Custom timestamp format: yyyy-MM-dd HH:mm:ss.SSS -->
            <timestampPattern>yyyy-MM-dd HH:mm:ss.SSS</timestampPattern>
        </encoder>
    </appender>

    <!-- Appender 2: console output (handy for debugging; can be disabled in production) -->
    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder class="net.logstash.logback.encoder.LogstashEncoder"> <!-- JSON on the console too, for container log collectors -->
            <includeMdcKeyName>event</includeMdcKeyName>
            <includeMdcKeyName>result</includeMdcKeyName>
            <includeMdcKeyName>client_type</includeMdcKeyName>
            <includeMdcKeyName>operation_type</includeMdcKeyName>
        </encoder>
    </appender>

    <!-- Root logger: global level plus appender references -->
    <root level="INFO">                      <!-- INFO and above; DEBUG is filtered out -->
        <appender-ref ref="JSON_FILE" />     <!-- write to the JSON file -->
        <appender-ref ref="CONSOLE" />       <!-- and to the console (debugging) -->
    </root>
</configuration>
```
2. Business Code (MDC-Tagged Logging, Commented)
```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC; // MDC (Mapped Diagnostic Context): per-thread logging context
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RestController;

import java.util.Map;

@RestController // controller handling the HTTP requests
public class BusinessController {

    private static final Logger log = LoggerFactory.getLogger(BusinessController.class);

    // Login endpoint: tag the MDC, then emit a structured log line
    @PostMapping("/login")
    public ResponseEntity<?> login(@RequestBody Map<String, String> request) {
        // 1. Extract parameters: client type (optional, defaults to "unknown")
        String clientType = request.getOrDefault("client_type", "unknown");
        // 2. Put business fields into the MDC: logback writes them into the JSON log line
        MDC.put("event", "login");            // mark the event type as login
        MDC.put("client_type", clientType);   // mark the client type
        // 3. Simulated login logic
        try {
            if ("admin".equals(request.get("username")) && "123456".equals(request.get("password"))) {
                MDC.put("result", "success"); // mark success
                log.info("user login");       // emit the log line; without it the MDC fields never reach the file
                return ResponseEntity.ok(Map.of("code", 200, "msg", "login succeeded"));
            } else {
                MDC.put("result", "failed");  // mark failure
                log.info("user login");
                return ResponseEntity.status(401).body(Map.of("code", 401, "msg", "invalid username or password"));
            }
        } finally {
            // 4. Clear the MDC: Tomcat reuses threads, stale fields would leak into later requests
            MDC.clear();
        }
    }

    // Business operation endpoint: tag the MDC, then emit a structured log line
    @PostMapping("/operation")
    public ResponseEntity<?> operation(@RequestBody Map<String, String> request) {
        // 1. Extract parameters: operation type (optional, defaults to "unknown")
        String operationType = request.getOrDefault("operation_type", "unknown");
        // 2. Put business fields into the MDC
        MDC.put("event", "operation");              // mark the event type
        MDC.put("operation_type", operationType);   // mark the operation type
        // 3. Simulated business logic
        try {
            log.info("user operation");
            return ResponseEntity.ok(Map.of("code", 200, "msg", "operation succeeded"));
        } finally {
            // 4. Clear the MDC to avoid thread-reuse pollution
            MDC.clear();
        }
    }
}
```
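With the encoder and the MDC tags in place, each request produces one JSON line in /var/log/business/app.log. An illustrative check, assuming the app runs locally on port 8080 (exact field order and values will differ):

```bash
curl -s -X POST localhost:8080/login -H 'Content-Type: application/json' \
  -d '{"username":"admin","password":"123456","client_type":"pc"}'

tail -n 1 /var/log/business/app.log
# Expected shape (illustrative):
# {"time":"2025-05-20 10:00:00.123","log_level":"INFO","logger_name":"...BusinessController",
#  "msg":"user login","event":"login","result":"success","client_type":"pc"}
```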
3. Telegraf Configuration (Remote Push to Elasticsearch/Prometheus, Commented)
```yaml
# telegraf-config.yaml: Telegraf configuration (log parsing + metric generation + remote push)
apiVersion: v1
kind: ConfigMap
metadata:
  name: telegraf-config       # mounted by the Telegraf sidecar
  namespace: default          # same namespace as the business Pod so it can be mounted
data:
  telegraf.conf: |
    # Telegraf main configuration: TOML format with agent, inputs and outputs sections
    [agent]
      interval = "10s"                  # collection interval
      round_interval = true             # align collections to the interval (e.g. :00, :10, :20)
      metric_batch_size = 1000          # metrics per write batch
      metric_buffer_limit = 10000       # buffered metrics cap (protects memory)
      collection_jitter = "1s"          # random jitter so inputs do not all fire at once
      flush_interval = "10s"            # how often outputs are flushed
      flush_jitter = "1s"               # random flush jitter to avoid traffic spikes
      logtarget = "file"                # write Telegraf's own log to a file (easier debugging)
      logfile = "/var/log/telegraf/telegraf.log"
      loglevel = "info"

    # Input plugin: tail (follows the log file, like tail -f)
    [[inputs.tail]]
      files = ["/var/log/business/app.log"]      # log file shared with the business container
      from_beginning = false                     # only new lines; do not re-read history
      follow_rename = true                       # keep following after log rotation renames the file
      max_undelivered_lines = 10000              # buffer up to 10000 undelivered lines
      name_override = "business"                 # measurement name; Prometheus metric names become business_<field>
      data_format = "json"                       # parse the JSON log lines
      json_query = ""                            # parse the whole JSON object
      json_time_key = "time"                     # timestamp field (matches the logback "time" field)
      json_time_format = "2006-01-02 15:04:05.000"   # Go reference-time layout, must match logback
      json_string_fields = ["msg", "log_level"]  # keep at least one field so every log line yields a metric
      tag_keys = ["event", "result", "client_type", "operation_type"]  # promote these fields to tags
      # Keep only login/operation events (tag-based filter; drops unrelated log lines)
      [inputs.tail.tagpass]
        event = ["login", "operation"]

    # Output plugin 1: prometheus_client (exposes a metrics endpoint for local Prometheus scraping)
    [[outputs.prometheus_client]]
      listen = ":9273"          # listen on 0.0.0.0:9273 inside the container
      metric_version = 2        # Prometheus metric format v2 (compatible with current Prometheus)
      path = "/metrics"         # scrape path; metric names take the form business_<field>

    # Output plugin 2: elasticsearch (pushes log-derived records to Elasticsearch for search/visualisation)
    [[outputs.elasticsearch]]
      urls = ["https://es-cluster.example.com:9200"]   # Elasticsearch endpoints (several nodes may be listed)
      index_name = "business-logs-%Y.%m.%d"            # daily indices (e.g. business-logs-2025.05.20), easy retention
      username = "es-admin"                            # credentials if X-Pack security is enabled
      password = "es-password"
      manage_template = true                           # let Telegraf create the index template (consistent field types)
      template_name = "business-logs-template"
      timeout = "10s"
      health_check_interval = "30s"                    # check cluster health every 30 seconds
      # Batching, retries and buffering are governed by the agent-level settings above.

    # Output plugin 3: push metrics to a remote Prometheus via the HTTP output with the
    # prometheusremotewrite serializer (Telegraf has no dedicated remote-write output plugin)
    [[outputs.http]]
      url = "https://remote-prom.example.com/api/v1/write"   # Remote Write endpoint
      data_format = "prometheusremotewrite"                  # serialize metrics in remote-write format
      username = "remote-prom-username"                      # basic auth
      password = "remote-prom-api-key"
      timeout = "5s"
      namepass = ["business"]                                # only push the measurement derived from the business log
      [outputs.http.headers]
        Content-Type = "application/x-protobuf"
        Content-Encoding = "snappy"
        X-Prometheus-Remote-Write-Version = "0.1.0"
```
4. K8s Pod Configuration (Business Container + Telegraf Sidecar, Commented)
```yaml
# business-pod-sidecar.yaml: Pod with the business container and a Telegraf sidecar
apiVersion: v1
kind: Pod
metadata:
  name: business-pod-sidecar
  labels:
    app: business-app                 # label used for Prometheus service discovery
spec:
  restartPolicy: Always               # restart containers automatically when they exit
  containers:
    # Container 1: the business container (Spring Boot application)
    - name: business-container
      image: your-business-image:latest    # replace with your actual image
      volumeMounts:
        - name: log-volume                 # shared log volume, also mounted by the Telegraf sidecar
          mountPath: /var/log/business     # must match the path in logback.xml
        - name: telegraf-log-volume        # Telegraf's own log directory (debugging)
          mountPath: /var/log/telegraf
      resources:                           # resource limits
        limits:
          cpu: 500m                        # at most 0.5 CPU core
          memory: 512Mi                    # at most 512 MiB
        requests:
          cpu: 200m
          memory: 256Mi
      readinessProbe:                      # readiness: is the app ready to receive traffic?
        httpGet:
          path: /actuator/health           # requires the spring-boot-starter-actuator dependency
          port: 8080
        initialDelaySeconds: 30            # start probing after 30 seconds
        periodSeconds: 10
        timeoutSeconds: 3
      livenessProbe:                       # liveness: restart the container if it stops responding
        httpGet:
          path: /actuator/health
          port: 8080
        initialDelaySeconds: 60
        periodSeconds: 20
        timeoutSeconds: 3
    # Container 2: Telegraf sidecar (log parsing + metric generation + remote push)
    - name: telegraf-sidecar
      image: telegraf:1.28-alpine          # lightweight Alpine-based image (~50 MB)
      volumeMounts:
        - name: log-volume                 # read the business log from the shared volume
          mountPath: /var/log/business
        - name: telegraf-config-volume     # telegraf.conf from the ConfigMap
          mountPath: /etc/telegraf/telegraf.conf
          subPath: telegraf.conf           # mount only the file, not a whole directory
        - name: telegraf-log-volume        # Telegraf's own log
          mountPath: /var/log/telegraf
      ports:
        - containerPort: 9273              # metrics endpoint scraped by Prometheus
          name: metrics
      resources:                           # the sidecar is lightweight
        limits:
          cpu: 100m
          memory: 128Mi
        requests:
          cpu: 50m
          memory: 64Mi
      readinessProbe:                      # readiness: is the metrics port listening?
        tcpSocket:
          port: 9273
        initialDelaySeconds: 10
        periodSeconds: 5
  volumes:
    - name: log-volume                     # shared log volume: emptyDir (lost when the Pod is deleted; use a PersistentVolume in production if needed)
      emptyDir:
        medium: Memory                     # memory-backed for faster I/O (optional)
        sizeLimit: 1Gi                     # cap at 1 GiB to protect node memory
    - name: telegraf-config-volume         # config volume backed by the ConfigMap
      configMap:
        name: telegraf-config
        items:
          - key: telegraf.conf
            path: telegraf.conf
    - name: telegraf-log-volume            # Telegraf log volume: emptyDir
      emptyDir:
        sizeLimit: 512Mi
```
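After applying the Pod manifest, the shared volume and the sidecar's metrics endpoint can be checked in place. A sketch, assuming the Pod name above and that the derived metrics carry the business prefix configured in telegraf.conf:

```bash
kubectl get pod business-pod-sidecar                                   # both containers should be Running

# The business log is visible from the sidecar through the shared emptyDir
kubectl exec business-pod-sidecar -c telegraf-sidecar -- tail -n 5 /var/log/business/app.log

# Telegraf exposes the derived metrics on port 9273
kubectl port-forward pod/business-pod-sidecar 9273:9273 &
curl -s localhost:9273/metrics | grep '^business'
```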
III. Approach 3: eBPF Collection (DeepFlow + Remote Metrics Push, Full Configuration, Commented)
1. DeepFlow Namespace and ServiceAccount (Permissions, Commented)
```yaml
# deepflow-namespace.yaml: dedicated namespace for DeepFlow
apiVersion: v1
kind: Namespace
metadata:
  name: deepflow                      # isolates the DeepFlow components
  labels:
    name: deepflow                    # label for easy identification
---
# deepflow-serviceaccount.yaml: ServiceAccount granting the DeepFlow Agent the permissions it needs
apiVersion: v1
kind: ServiceAccount
metadata:
  name: deepflow-agent
  namespace: deepflow
---
# deepflow-clusterrole.yaml: ClusterRole (cluster-wide, read-only access to the K8s API)
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: deepflow-agent-role
rules:
  - apiGroups: [""]                                                     # core API group (Pods, Nodes, Services, ...)
    resources: ["pods", "nodes", "services", "endpoints", "namespaces"] # resources the agent needs to read
    verbs: ["get", "list", "watch"]                                     # read-only, keeps the blast radius small
  - apiGroups: ["networking.k8s.io"]                                    # networking API group
    resources: ["ingresses"]                                            # Ingress objects (for Ingress traffic collection)
    verbs: ["get", "list", "watch"]
---
# deepflow-clusterrolebinding.yaml: bind the ClusterRole to the ServiceAccount
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: deepflow-agent-binding
subjects:
  - kind: ServiceAccount
    name: deepflow-agent
    namespace: deepflow
roleRef:
  kind: ClusterRole
  name: deepflow-agent-role
  apiGroup: rbac.authorization.k8s.io
```
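kubectl auth can-i can confirm that the ClusterRoleBinding grants exactly the read-only access the agent needs; a quick sketch:

```bash
kubectl auth can-i list pods   --as=system:serviceaccount:deepflow:deepflow-agent   # expected: yes
kubectl auth can-i watch nodes --as=system:serviceaccount:deepflow:deepflow-agent   # expected: yes
kubectl auth can-i delete pods --as=system:serviceaccount:deepflow:deepflow-agent   # expected: no (read-only role)
```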
2. DeepFlow Agent DaemonSet (with Remote Push, Commented)
```yaml
# deepflow-agent-daemonset.yaml: DaemonSet running one DeepFlow Agent Pod per node
apiVersion: apps/v1
kind: DaemonSet                       # DaemonSet: one Pod per node, suited to node-level collection
metadata:
  name: deepflow-agent
  namespace: deepflow
  labels:
    app: deepflow-agent
spec:
  selector:
    matchLabels:
      app: deepflow-agent
  updateStrategy:
    type: RollingUpdate               # update node by node to avoid losing coverage
    rollingUpdate:
      maxUnavailable: 1               # at most one Pod unavailable during an update
      maxSurge: 0                     # no extra Pods (a DaemonSet runs exactly one Pod per node)
  template:
    metadata:
      labels:
        app: deepflow-agent
    spec:
      serviceAccountName: deepflow-agent   # grants the K8s API permissions defined above
      hostNetwork: true               # use the node's network namespace to capture host and container traffic
      hostPID: true                   # share the host PID namespace to correlate containers with processes
      hostIPC: true                   # share the host IPC namespace (used by the eBPF programs)
      tolerations:                    # tolerate control-plane taints so master traffic is also captured
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
        - key: node-role.kubernetes.io/control-plane
          effect: NoSchedule
      containers:
        - name: deepflow-agent
          image: deepflowce/agent:latest    # community image; pin a specific version in production (e.g. v6.3.0)
          imagePullPolicy: Always           # always pull, so updates are picked up
          securityContext:
            privileged: true                # eBPF needs kernel access and host devices
            capabilities:                   # extra Linux capabilities required by eBPF
              add: ["SYS_ADMIN", "SYS_RESOURCE", "NET_ADMIN", "NET_RAW", "IPC_LOCK"]
          env:
            # 1. DeepFlow Server: where the agent sends collected data
            - name: DEEPFLOW_SERVER_IP
              value: "deepflow-server.deepflow.svc"   # in-cluster Service address; replace if the server runs elsewhere
            - name: DEEPFLOW_SERVER_PORT
              value: "30033"                          # default data port
            # 2. Agent identity
            - name: DEEPFLOW_AGENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName            # use the node name so each agent is identifiable
            - name: DEEPFLOW_AGENT_GROUP
              value: "k8s-cluster"                    # agent group, useful when managing several clusters
            # 3. eBPF settings
            - name: DEEPFLOW_EBPF_ENABLE
              value: "true"                           # eBPF is the primary collection mechanism
            - name: DEEPFLOW_EBPF_KERNEL_VERSION_AUTO_DETECT
              value: "true"                           # adapt to each node's kernel version
            # 4. Remote push: forward metrics to a remote Prometheus
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_URL
              value: "https://remote-prom.example.com/api/v1/write"   # Remote Write endpoint
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_USERNAME
              value: "remote-prom-username"           # remote username
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_PASSWORD
              value: "remote-prom-api-key"            # remote API key/password
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_INTERVAL
              value: "5s"                             # push every 5 seconds
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_BATCH_SIZE
              value: "1000"                           # samples per batch
            # 5. Logging
            - name: DEEPFLOW_LOG_LEVEL
              value: "info"
            - name: DEEPFLOW_LOG_FILE
              value: "/var/log/deepflow/agent.log"    # agent log path
          volumeMounts:
            # 1. Host root: kernel files, eBPF programs, container runtime data (Docker/containerd)
            - name: host-root
              mountPath: /host
              readOnly: true                  # read-only, never modify host files
            # 2. Host /dev: network devices (e.g. eth0) and eBPF maps
            - name: host-dev
              mountPath: /dev
            # 3. Host /sys: kernel parameters and the eBPF filesystem
            - name: host-sys
              mountPath: /sys
              readOnly: true
            # 4. Host /var/run: container runtime sockets (e.g. /var/run/docker.sock)
            - name: host-var-run
              mountPath: /var/run
              readOnly: true
            # 5. Agent log directory
            - name: deepflow-log
              mountPath: /var/log/deepflow
            # 6. eBPF cache: programs and maps
            - name: deepflow-ebpf
              mountPath: /var/lib/deepflow/ebpf
          resources:                          # the agent is fairly light; raise the CPU limit on high-traffic nodes
            limits:
              cpu: 1000m                      # at most 1 core
              memory: 512Mi
            requests:
              cpu: 100m
              memory: 64Mi
          readinessProbe:                     # readiness: is the agent connected to the server?
            exec:
              command: ["/usr/bin/deepflow-agent", "healthcheck"]
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
          livenessProbe:                      # liveness: is the agent still alive?
            exec:
              command: ["/usr/bin/deepflow-agent", "status"]
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 3
      volumes:
        - name: host-root                     # 1. host root directory
          hostPath:
            path: /
            type: Directory
        - name: host-dev                      # 2. host /dev
          hostPath:
            path: /dev
            type: Directory
        - name: host-sys                      # 3. host /sys
          hostPath:
            path: /sys
            type: Directory
        - name: host-var-run                  # 4. host /var/run
          hostPath:
            path: /var/run
            type: Directory
        - name: deepflow-log                  # 5. agent log volume: emptyDir (use a PersistentVolume in production if needed)
          emptyDir:
            sizeLimit: 1Gi
        - name: deepflow-ebpf                 # 6. eBPF cache volume: emptyDir
          emptyDir:
            sizeLimit: 256Mi
```
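A rollout check confirms that one agent Pod is running per node and that the agents can reach the server; a sketch using standard kubectl commands:

```bash
kubectl -n deepflow rollout status daemonset/deepflow-agent
kubectl -n deepflow get pods -o wide -l app=deepflow-agent        # one Pod per node, including control-plane nodes

# Look for connection errors to the DeepFlow Server in the agent log
kubectl -n deepflow logs daemonset/deepflow-agent --tail=50
```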
3. DeepFlow Server Configuration (with Remote Push, Commented)
```yaml
# deepflow-server-statefulset.yaml: DeepFlow Server as a StatefulSet (stable identity and storage)
apiVersion: apps/v1
kind: StatefulSet                     # StatefulSet: for stateful workloads (databases, middleware, ...)
metadata:
  name: deepflow-server
  namespace: deepflow
spec:
  serviceName: "deepflow-server"      # headless Service name used for StatefulSet DNS
                                      # (e.g. deepflow-server-0.deepflow-server.deepflow.svc)
  replicas: 1                         # use 3 replicas in production for high availability
  selector:
    matchLabels:
      app: deepflow-server
  template:
    metadata:
      labels:
        app: deepflow-server
    spec:
      containers:
        - name: deepflow-server
          image: deepflowce/server:latest   # community image
          ports:
            - containerPort: 30033          # data port: agents connect here
            - containerPort: 30034          # HTTP API port: metric/log queries
            - containerPort: 9090           # Prometheus port: the server's own metrics
          env:
            # 1. Cluster settings
            - name: DEEPFLOW_CLUSTER_ROLE
              value: "master"               # single-node deployment
            - name: DEEPFLOW_CLUSTER_NODES
              value: "deepflow-server-0.deepflow-server.deepflow.svc:30033"   # StatefulSet DNS name
            # 2. Storage settings
            - name: DEEPFLOW_STORAGE_TYPE
              value: "local"                # local storage; use MySQL/PostgreSQL in production
            - name: DEEPFLOW_STORAGE_LOCAL_PATH
              value: "/var/lib/deepflow/storage"
            # 3. Remote push: forward metrics to a remote Prometheus
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_ENABLE
              value: "true"
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_URL
              value: "https://remote-prom.example.com/api/v1/write"
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_USERNAME
              value: "remote-prom-username"
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_PASSWORD
              value: "remote-prom-api-key"
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_INTERVAL
              value: "5s"                   # push every 5 seconds
            - name: DEEPFLOW_PROMETHEUS_REMOTE_WRITE_METRIC_PREFIX
              value: "deepflow_"            # metric prefix, separates these metrics from others
          volumeMounts:
            - name: deepflow-storage        # persistent storage for metrics/logs
              mountPath: /var/lib/deepflow/storage
          resources:                        # the server needs more resources than the agents
            limits:
              cpu: 2000m                    # at most 2 cores
              memory: 2Gi
            requests:
              cpu: 500m
              memory: 1Gi
  volumeClaimTemplates:                     # the StatefulSet creates one PVC per replica
    - metadata:
        name: deepflow-storage
      spec:
        accessModes: [ "ReadWriteOnce" ]    # single-node read/write
        resources:
          requests:
            storage: 10Gi                   # small setup; size up (e.g. 100Gi) in production
---
# deepflow-server-service.yaml: Service exposing the DeepFlow Server to agents and API clients
apiVersion: v1
kind: Service
metadata:
  name: deepflow-server
  namespace: deepflow
spec:
  selector:
    app: deepflow-server
  ports:
    - port: 30033
      targetPort: 30033
      name: data                            # data ingestion
    - port: 30034
      targetPort: 30034
      name: api                             # HTTP API
    - port: 9090
      targetPort: 9090
      name: metrics                         # server metrics
  clusterIP: None                           # headless Service: DNS resolves directly to Pod IPs (required by the StatefulSet)
```
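The StatefulSet, the headless Service, and the per-replica PVC can be verified with standard kubectl commands; the DNS check below is a sketch that spins up a temporary busybox Pod:

```bash
kubectl -n deepflow get statefulset,svc,pvc

# The headless Service should resolve deepflow-server-0 directly to its Pod IP
kubectl -n deepflow run dns-test --rm -it --restart=Never --image=busybox -- \
  nslookup deepflow-server-0.deepflow-server.deepflow.svc
```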
IV. Approach 4: Full Istio Data Collection (Remote Metric/Log Push, Full Configuration)
1. Architecture Overview
Istio uses the Envoy sidecar to intercept traffic and collect data: metrics are scraped by Prometheus by default, access logs can be shipped to remote storage (such as Elasticsearch) via Fluent Bit, and traces are sent to Jaeger. This approach covers the three core capabilities of remote metric writes, structured log shipping, and end-to-end tracing, and is suitable for production deployments.
2. Configuration Manifests
1. Istio Control Plane Configuration (istio-config.yaml)
```yaml
apiVersion: install.istio.io/v1alpha1
kind: IstioOperator
metadata:
  name: istio-controlplane
  namespace: istio-system
spec:
  profile: default
  meshConfig:                          # mesh-wide traffic configuration
    accessLogFile: /dev/stdout         # enable sidecar access logs
    accessLogEncoding: JSON            # structured (JSON) log output
    defaultConfig:
      proxyMetadata:                   # enable Prometheus scrape annotations on the proxies
        ISTIO_META_PROMETHEUS_ANNOTATIONS: "true"
      tracing:                         # send traces to Jaeger (Zipkin-compatible endpoint)
        zipkin:
          address: jaeger-collector.jaeger-system.svc.cluster.local:9411
  components:
    pilot:
      k8s:
        resources:
          requests:
            cpu: 200m
            memory: 256Mi
    ingressGateways:
      - name: istio-ingressgateway
        enabled: true
        k8s:
          service:
            type: LoadBalancer         # consider NodePort/Ingress in production
  values:
    global:
      proxy:
        resources:
          requests:
            cpu: 100m
            memory: 128Mi
    # Metrics are written remotely via Prometheus Remote Write (configured separately below)
    prometheus:
      enabled: false                   # disable the bundled Prometheus; an external one scrapes the mesh
```
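Before installing, the IstioOperator overlay can be rendered locally to catch mistakes, and after installation every injected Pod serves merged Prometheus metrics on port 15020. A sketch, with <injected-pod> as a placeholder for one of your workload Pods:

```bash
# Render the manifests locally to sanity-check the overlay
istioctl manifest generate -f istio-config.yaml > /tmp/istio-rendered.yaml
grep -c '^kind:' /tmp/istio-rendered.yaml

# After installation and injection: merged metrics on the sidecar's port 15020
kubectl port-forward <injected-pod> 15020:15020 &
curl -s localhost:15020/stats/prometheus | head
```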
2. Structured Access Log Configuration (istio-accesslog.yaml)
```yaml
apiVersion: networking.istio.io/v1alpha3
kind: EnvoyFilter
metadata:
  name: accesslog-json-config
  namespace: istio-system
spec:
  configPatches:
    - applyTo: NETWORK_FILTER
      match:
        context: SIDECAR_INBOUND
        listener:
          filterChain:
            filter:
              name: "envoy.filters.network.http_connection_manager"
      patch:
        operation: MERGE
        value:
          typed_config:
            "@type": "type.googleapis.com/udpa.type.v1.TypedStruct"
            type_url: "type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager"
            value:
              access_log:
                - name: envoy.access_loggers.file
                  typed_config:
                    "@type": "type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog"
                    path: /dev/stdout
                    format: |
                      {
                        "timestamp": "%START_TIME%",
                        "source_ip": "%REQ(X-FORWARDED-FOR)%",
                        "destination_ip": "%DOWNSTREAM_LOCAL_ADDRESS%",
                        "method": "%REQ(:METHOD)%",
                        "path": "%REQ(:PATH)%",
                        "status_code": "%RESPONSE_CODE%",
                        "response_time": "%DURATION%",
                        "bytes_sent": "%BYTES_SENT%",
                        "bytes_received": "%BYTES_RECEIVED%",
                        "service_name": "%UPSTREAM_HOST%",
                        "trace_id": "%REQ(X-B3-TraceId)%"
                      }
# Configure the SIDECAR_OUTBOUND and ingress gateway contexts the same way
```
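Whether the EnvoyFilter took effect is easiest to see in the sidecar's stdout. A sketch, with <injected-pod> as a placeholder and jq used locally for pretty-printing:

```bash
# Each request should produce one JSON object on the istio-proxy container's stdout
kubectl logs <injected-pod> -c istio-proxy --tail=5

# Pretty-print the most recent JSON entry
kubectl logs <injected-pod> -c istio-proxy --tail=50 | grep '^{' | tail -n 1 | jq .
```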
3. Metrics Remote Write Configuration (prometheus-remote-write.yaml)
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-remote-config
  namespace: istio-system
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
    scrape_configs:
      - job_name: 'istio-proxies'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__address__]        # rewrite the target to the Envoy merged-metrics port
            regex: '([^:]+)(?::\d+)?'
            target_label: __address__
            replacement: '$1:15020'             # 15020 is the merged Prometheus metrics port on the sidecar
    remote_write:
      - url: "http://prometheus-remote-write.monitoring.svc.cluster.local:9090/api/v1/write"   # remote Prometheus endpoint
        queue_config:
          max_samples_per_send: 1000
          max_shards: 200
          capacity: 2500
```
4. Fluent Bit Log Collection Configuration (fluent-bit-config.yaml)
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluent-bit-config
  namespace: istio-system
data:
  fluent-bit.conf: |
    [SERVICE]
        Flush             1
        Log_Level         info
        Daemon            off
        Parsers_File      parsers.conf
    [INPUT]
        Name              tail
        Path              /var/log/containers/*.log
        Parser            docker
        Tag               kube.*
        Refresh_Interval  10
        Mem_Buf_Limit     5MB
        Skip_Long_Lines   On
    [FILTER]
        Name              grep
        Match             kube.*
        Exclude           log lvl=debug        # drop debug-level lines
        Regex             log service_name     # keep only entries that carry an Istio service_name field
    [OUTPUT]
        Name              es
        Match             *
        Host              elasticsearch-master.logging.svc.cluster.local
        Port              9200
        Index             istio-access-logs
        Logstash_Format   On
        Logstash_Prefix   istio-logs
        Retry_Limit       False
```
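Once Fluent Bit is shipping logs, the daily indices should show up in Elasticsearch. A sketch, assuming the elasticsearch-master Service in the logging namespace and no authentication (add -u user:pass if X-Pack security is enabled):

```bash
kubectl -n logging port-forward svc/elasticsearch-master 9200:9200 &

curl -s 'http://localhost:9200/_cat/indices/istio-logs-*?v'          # one index per day
curl -s 'http://localhost:9200/istio-logs-*/_search?size=1&pretty'   # sample document
```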
5. Trace Sampling Configuration (istio-tracing.yaml)
```yaml
# Mesh-wide sampling and custom tags via the Istio Telemetry API
apiVersion: telemetry.istio.io/v1alpha1
kind: Telemetry
metadata:
  name: mesh-default
  namespace: istio-system             # placing it in the root namespace applies it mesh-wide
spec:
  tracing:
    - randomSamplingPercentage: 100   # 100% for testing; 5-10% is more realistic in production, tuned to traffic volume
      customTags:
        service_name:
          literal:
            value: "business-app"     # literal tags are static strings; adjust per service
```
3. Deployment and Verification
Install the Istio control plane
```bash
istioctl install -f istio-config.yaml -y
```
Deploy the EnvoyFilter and the log/metric collection components
```bash
kubectl apply -f istio-accesslog.yaml
kubectl apply -f fluent-bit-config.yaml
kubectl apply -f prometheus-remote-write.yaml
```
Inject the sidecar into the business namespace
```bash
kubectl label namespace default istio-injection=enabled
```
Verify data collection
- Metrics: open http://<prometheus-remote-addr>/graph and query istio_requests_total (or use the HTTP API as sketched below)
- Logs: search the istio-logs-* index in Elasticsearch and inspect the structured entries
- Traces: open the Jaeger UI and filter by service name to follow the call chains
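The metrics check can also be scripted against the Prometheus HTTP API instead of the web UI; a sketch using the remote address placeholder from the list above:

```bash
curl -sG 'http://<prometheus-remote-addr>/api/v1/query' \
  --data-urlencode 'query=sum(rate(istio_requests_total[5m])) by (destination_service)'
```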
4. Production Tuning Notes
- Resource limits: adjust the default sidecar resources to the traffic volume (500m+ CPU is advisable for high-concurrency services);
- Log filtering: use Fluent Bit's grep filter to drop noise such as health-check requests;
- Sampling strategy: tune the trace sampling rate with QPS (around 1% once QPS exceeds 1000);
- Monitoring and alerting: alert on istio_requests_total{response_code=~"5.."} to catch service errors (a query sketch follows this list);
- Security: enable TLS for log/metric transport (set the tls parameters when connecting to the remote services).
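Before wiring the 5xx condition into an alert rule, the expression can be previewed against the same API; a sketch (label names may vary slightly across Istio versions):

```bash
curl -sG 'http://<prometheus-remote-addr>/api/v1/query' \
  --data-urlencode 'query=sum(rate(istio_requests_total{response_code=~"5.."}[5m])) by (destination_service)'
```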