Skip to content

未分类 - 14. page

1.CurrentActivity.sql

 

set lines 150
set pages 999
clear col

set termout off
set trimout on
set trimspool on

REM
REM Current transactions
REM
REM Will show only last transaction by a user
REM
REM May need to use 786472.1 for better picture
REM of activity

connect / as sysdba
alter session set nls_date_format=’dd-Mon-yyyy hh24:mi’;

col username format a10 wrapped heading “User”
col name format a22 wrapped heading “Undo Segment Name”
col xidusn heading “Undo|Seg #”
col xidslot heading “Undo|Slot #”
col xidsqn heading “Undo|Seq #”
col ubafil heading “File #”
col ubablk heading “Block #”
col start_time format a10 word_wrapped heading “Started”
col status format a8 heading “Status”
col blk format 999,999,999 heading “KBytes”
col used_urec heading “Rows”

spool undoactivity.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## Current Uncommitted Transactions ##############
prompt

select start_time, username, r.name,
ubafil, ubablk, t.status, (used_ublk*p.value)/1024 blk, used_urec
from v$transaction t, v$rollname r, v$session s, v$parameter p
where xidusn=usn
and s.saddr=t.ses_addr
and p.name=’db_block_size’
order by 1;
spool off

set termout on
set trimout off
set trimspool off
clear col

 

2.LobData.sql

 

REM List table and column information for LOBs for a specific user
REM
REM UNDO handling with LOBs is not designed for frequent updates
REM Frequent updates are best handled with PCTVERSION at 100
REM This means you must have a lot of space available in the LOB
REM tablespace as all UNDO will be maintained over time.
REM
REM Using RETENTION does not work as expected
REM It is set to UNDO_RETENTION at the time of the creation of the
REM object. It does not change over time as UNDO_RETENTION
REM or auto-tuned undo retention changes.

set pages 999
set lines 110

spool lobdata.out

col column_name format a25 head “Column”
col table_name format a25 head “Table”
col tablespace_name format a25 head “Tablespace”
col pctversion format 999 head “PCTVersion %”
col segment_space_management format a30 head “Space|Mngmnt”
col retention format 999,999,999 head “Retention”

select l.table_name, l.column_name, l.tablespace_name, l.pctversion, l.retention,
t.segment_space_management
from dba_lobs l, dba_tablespaces t
where owner=upper(‘&USER’)
and l.tablespace_name = t.tablespace_name
/

spool off

 

3. undoconfig.sql

spool UndoConfig.out

ttitle off
set pages 999
set lines 150
set verify off

set termout off
set trimout on
set trimspool on

REM
REM ————————————————————————

REM
REM —————————————————————–
REM

set space 2

REM REPORTING TABLESPACE INFORMATION:
REM
REM This looks at Tablespace Sizing – Total bytes and free bytes
REM

column tablespace_name format a30 heading ‘TS Name’
column sbytes format 9,999,999,999 heading ‘Total MBytes’
column fbytes format 9,999,999,999 heading ‘Free MBytes’
column file_name format a30 heading ‘File Name’
column kount format 999 heading ‘Ext’

compute sum of fbytes on tablespace_name
compute sum of sbytes on tablespace_name
compute sum of sbytes on report
compute sum of fbytes on report

break on tablespace_name skip 2

select a.tablespace_name, a.file_name, round(a.bytes/1024/1024,0) sbytes,
round(sum(b.bytes/1024/1024),0) fbytes, count(*) kount, autoextensible
from dba_data_files a, dba_free_space b
where a.file_id = b.file_id
and a.tablespace_name in (select z.tablespace_name from dba_tablespaces z where retention like ‘%GUARANTEE’)
group by a.tablespace_name, a.file_name, a.bytes, autoextensible
order by a.tablespace_name
/

set linesize 160

REM
REM If you can significantly reduce physical reads by adding incremental
REM data buffers…do it. To determine whether adding data buffers will
REM help, set db_block_lru_statistics = TRUE and
REM db_block_lru_extended_statistics = TRUE in the init.ora parameters.
REM You can determine how many extra hits you would get from memory as
REM opposed to physical I/O from disk. **NOTE: Turning these on will
REM impact performance. One shift of statistics gathering should be enough
REM to get the required information.
REM

REM
REM —————————————————————–
REM

set lines 160

col tablespace_name format a30 heading “Tablespace”
col tb format a15 heading “TB Status”
col df format a10 heading “DF Status”
col extent_management format a15 heading “Extent|Management”
col allocation_type format a8 heading “Type”
col segment_space_management format a7 heading “Auto|Segment”
col retention format a11 heading “Retention|Level”
col autoextensible format a5 heading “Auto?”
col mx format 999,999,999 heading “Max Allowed”

select t.tablespace_name, t.status tb, d.status df,
extent_management, allocation_type, segment_space_management, retention,
autoextensible, (maxbytes/1024/1024) mx
from dba_tablespaces t, dba_data_files d
where t.tablespace_name = d.tablespace_name
and retention like ‘%GUARANTEE’
/

col status format a20 head “Status”
col cnt format 999,999,999 head “How Many?”

select status, count(*) cnt
from dba_rollback_segs
group by status
/

spool off

set termout on
set trimout off
set trimspool off

4.undodatafiles.sql

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-Mon-yyyy hh24:mi’;

spool undodatafiles.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## DATAFILES ##############
prompt

col retention head “Retention”
col tablespace_name format a30 head “TBSP Name”
col file_id format 999 head “File #”
col a format 999,999,999,999,999 head “Bytes Alloc (MB)”
col b format 999,999,999,999,999 head “Max Bytes Used (MB)”
col autoextensible head “Auto|Ext”
col extent_management head “Ext Mngmnt”
col allocation_type head “Type”
col segment_space_management head “SSM”

select tablespace_name, file_id, sum(bytes)/1024/1024 a,
sum(maxbytes)/1024/1024 b,
autoextensible
from dba_data_files
where tablespace_name in (select tablespace_name from dba_tablespaces
where retention like ‘%GUARANTEE’ )
group by file_id, tablespace_name, autoextensible
order by tablespace_name
/

spool off
set termout on
set trimout off
set trimspool off
clear col

5.undoextends.sql

spool UndoExts.out

ttitle off
set pages 999
set lines 150
set verify off

set termout off
set trimout on
set trimspool on

REM
REM ————————————————————————

REM
REM —————————————————————–
REM

REM
REM REPORTING UNDO EXTENTS INFORMATION:
REM
REM —————————————————————–
REM
REM Undo Extents breakdown information
REM

ttitle center “Rollback Segments Breakdown” skip 2

col status format a20
col cnt format 999,999,999 head “How Many?”

select status, count(*) cnt from dba_rollback_segs
group by status
/

ttitle center “Undo Extents” skip 2

col segment_name format a30 heading “Name”
col “ACT BYTES” format 999,999,999,999 head “Active|Extents”
col “UNEXP BYTES” format 999,999,999,999 head “Unxpired|Extents”
col “EXP BYTES” format 999,999,999,999 head “Expired|Extents”

select segment_name,
nvl(sum(act),0) “ACT BYTES”,
nvl(sum(unexp),0) “UNEXP BYTES”,
nvl(sum(exp),0) “EXP BYTES”
from (
select segment_name,
nvl(sum(bytes),0) act,00 unexp, 00 exp
from DBA_UNDO_EXTENTS
where status=’ACTIVE’ group by segment_name
union
select segment_name,
00 act, nvl(sum(bytes),0) unexp, 00 exp
from DBA_UNDO_EXTENTS
where status=’UNEXPIRED’ group by segment_name
union
select segment_name,
00 act, 00 unexp, nvl(sum(bytes),0) exp
from DBA_UNDO_EXTENTS
where status=’EXPIRED’ group by segment_name
) group by segment_name;

ttitle center “Undo Extents Statistics” skip 2

col size format 999,999,999,999 heading “Size”
col “HOW MANY” format 999,999,999 heading “How Many?”
col st heading a12 heading “Status”

select distinct status st, count(*) “HOW MANY”, sum(bytes) “SIZE”
from dba_undo_extents
group by status
/

col segment_name format a30 heading “Name”
col TABLESPACE_NAME for a20
col BYTES for 999,999,999,999
col BLOCKS for 999,999,999
col status for a15 heading “Status”
col segment_name heading “Segment”
col extent_id heading “ID”

select SEGMENT_NAME, TABLESPACE_NAME, EXTENT_ID,
FILE_ID, BLOCK_ID, BYTES, BLOCKS, STATUS
from dba_undo_extents
order by 1,3,4,5
/

REM
REM —————————————————————–
REM
REM Undo Extents Contention breakdown
REM Take out column TUNED_UNDORETENTION if customer
REM prior to 10.2.x
REM
REM The time frame can be adjusted with this query
REM By default using around 4 hour window of time
REM
REM Ex.
REM Using sysdate-.04 looking at the last hour
REM Using sysdate-.16 looking at the last 4 hours
REM Using sysdate-.32 looking at the last 8 hours
REM Using sysdate-1 looking at the last 24 hours
REM

set linesize 140

ttitle center “Undo Extents Error Conditions (Default – Last 4 Hours)” skip 2

col UNXPSTEALCNT format 999,999,999 heading “# Unexpired|Stolen”
col EXPSTEALCNT format 999,999,999 heading “# Expired|Reused”
col SSOLDERRCNT format 999,999,999 heading “ORA-1555|Error”
col NOSPACEERRCNT format 999,999,999 heading “Out-Of-space|Error”
col MAXQUERYLEN format 999,999,999 heading “Max Query|Length”
col TUNED_UNDORETENTION format 999,999,999 heading “Auto-Ajusted|Undo Retention”
col hours format 999,999 heading “Tuned|(HRs)”

select inst_id, to_char(begin_time,’MM/DD/YYYY HH24:MI’) begin_time,
UNXPSTEALCNT, EXPSTEALCNT , SSOLDERRCNT, NOSPACEERRCNT, MAXQUERYLEN,
TUNED_UNDORETENTION, TUNED_UNDORETENTION/60/60 hours
from gv$undostat
where begin_time between (sysdate-.16)
and sysdate
order by inst_id, begin_time
/

spool off
set termout on
set trimout off
set trimspool off

6.undoheathcheck.sql

spool Undohealth.out

ttitle off
set pages 999
set lines 150
set verify off
set termout off
set trimout on
set trimspool on

REM
REM ————————————————————————

col name format a30
col gets format 9,999,999
col waits format 9,999,999

PROMPT ROLLBACK HIT STATISTICS:
REM

REM GETS – # of gets on the rollback segment header
REM WAITS – # of waits for the rollback segment header

set head on;

select name, waits, gets
from v$rollstat, v$rollname
where v$rollstat.usn = v$rollname.usn
/

col pct head “< 2% ideal” select ‘The average of waits/gets is ‘|| round((sum(waits) / sum(gets)) * 100,2)||’%’ PCT From v$rollstat / PROMPT REDO CONTENTION STATISTICS: REM REM If the ratio of waits to gets is more than 1% or 2%, consider REM creating more rollback segments REM REM Another way to gauge rollback contention is: REM column xn1 format 9999999 column xv1 new_value xxv1 noprint select class, count from v$waitstat where class in (‘system undo header’, ‘system undo block’, ‘undo header’, ‘undo block’ ) / set head off select ‘Total requests = ‘||sum(count) xn1, sum(count) xv1 from v$waitstat / select ‘Contention for system undo header = ‘|| (round(count/(&xxv1+0.00000000001),4)) * 100||’%’ from v$waitstat where class = ‘system undo header’ / select ‘Contention for system undo block = ‘|| (round(count/(&xxv1+0.00000000001),4)) * 100||’%’ from v$waitstat where class = ‘system undo block’ / select ‘Contention for undo header = ‘|| (round(count/(&xxv1+0.00000000001),4)) * 100||’%’ from v$waitstat where class = ‘undo header’ / select ‘Contention for undo block = ‘|| (round(count/(&xxv1+0.00000000001),4)) * 100||’%’ from v$waitstat where class = ‘undo block’ / REM REM NOTE: Not as useful with AUM configured REM REM If the percentage for an area is more than 1% or 2%, consider REM creating more rollback segments. Note: This value is usually very REM small REM and has been rounded to 4 places. REM REM ———————————————————————— REM REM The following shows how often user processes had to wait for space in REM the redo log buffer: select name||’ = ‘||value from v$sysstat where name = ‘redo log space requests’ / REM REM This value should be near 0. If this value increments consistently, REM processes have had to wait for space in the redo buffer. If this REM condition exists over time, increase the size of LOG_BUFFER in the REM init.ora file in increments of 5% until the value nears 0. REM ** NOTE: increasing the LOG_BUFFER value will increase total SGA size. REM REM ———————————————————————– col name format a15 col gets format 9999999 col misses format 9999999 col immediate_gets heading ‘IMMED GETS’ format 9999999 col immediate_misses heading ‘IMMED MISS’ format 9999999 col sleeps format 999999 PROMPT LATCH CONTENTION: REM REM GETS – # of successful willing-to-wait requests for a latch REM MISSES – # of times an initial willing-to-wait request was unsuccessful REM IMMEDIATE_GETS – # of successful immediate requests for each latch REM IMMEDIATE_MISSES = # of unsuccessful immediate requests for each latch REM SLEEPS – # of times a process waited and requests a latch after an REM initial willing-to-wait request REM REM If the latch requested with a willing-to-wait request is not REM available, the requesting process waits a short time and requests REM again. REM If the latch requested with an immediate request is not available, REM the requesting process does not wait, but continues processing REM set head on select name, gets, misses, immediate_gets, immediate_misses, sleeps from v$latch where name in (‘redo allocation’, ‘redo copy’) / set head off select ‘Ratio of MISSES to GETS: ‘|| round((sum(misses)/(sum(gets)+0.00000000001) * 100),2)||’%’ from v$latch where name in (‘redo allocation’, ‘redo copy’) / select ‘Ratio of IMMEDIATE_MISSES to IMMEDIATE_GETS: ‘|| round((sum(immediate_misses)/ (sum(immediate_misses+immediate_gets)+0.00000000001) * 100),2)||’%’ from v$latch where name in (‘redo allocation’, ‘redo copy’) / set head on REM REM If either ratio exceeds 1%, performance will be affected. REM REM Decreasing the size of LOG_SMALL_ENTRY_MAX_SIZE reduces the number of REM processes copying information on the redo allocation latch. REM REM Increasing the size of LOG_SIMULTANEOUS_COPIES will reduce contention REM for redo copy latches. REM REM —————————————————————– REM This looks at overall i/o activity against individual REM files within a tablespace REM REM Look for a mismatch across disk drives in terms of I/O REM REM Also, examine the Blocks per Read Ratio for heavily accessed REM TSs – if this value is significantly above 1 then you may have REM full tablescans occurring (with multi-block I/O) REM REM If activity on the files is unbalanced, move files around to balance REM the load. Should see an approximately even set of numbers across files REM set space 1 PROMPT REPORTING I/O STATISTICS: column pbr format 99999999 heading ‘Physical|Blk Read’ column pbw format 999999 heading ‘Physical|Blks Wrtn’ column pyr format 999999 heading ‘Physical|Reads’ column readtim format 99999999 heading ‘Read|Time’ column name format a55 heading ‘DataFile Name’ column writetim format 99999999 heading ‘Write|Time’ compute sum of f.phyblkrd, f.phyblkwrt on report select fs.name name, f.phyblkrd pbr, f.phyblkwrt pbw, f.readtim, f.writetim from v$filestat f, v$datafile fs where f.file# = fs.file# order by fs.name / REM REM —————————————————————– PROMPT GENERATING WAIT STATISTICS: REM REM This will show wait stats for certain kernel instances. This REM may show the need for additional rbs, wait lists, db_buffers REM column class heading ‘Class Type’ column count heading ‘Times Waited’ format 99,999,999 column time heading ‘Total Times’ format 99,999,999 select class, count, time from v$waitstat where count > 0
order by class
/

REM
REM Look at the wait statistics generated above (if any). They will
REM tell you where there is contention in the system. There will
REM usually be some contention in any system – but if the ratio of
REM waits for a particular operation starts to rise, you may need to
REM add additional resource, such as more database buffers, log buffers,
REM or rollback segments
REM
REM —————————————————————–

PROMPT ROLLBACK EXTENT STATISTICS:
REM

column usn format 999 heading ‘Undo #’
column extents format 999 heading ‘Extents’
column rssize format 999,999,999 heading ‘Size in|Bytes’
column optsize format 999,999,999 heading ‘Optimal|Size’
column hwmsize format 99,999,999 heading ‘High Water|Mark’
column shrinks format 9,999 heading ‘Num of|Shrinks’
column wraps format 9,999 heading ‘Num of|Wraps’
column extends format 999,999 heading ‘Num of|Extends’
column aveactive format 999,999,999 heading ‘Average size|Active Extents’
column rownum noprint

select usn, extents, rssize, optsize, hwmsize,
shrinks, wraps, extends, aveactive
from v$rollstat
order by rownum
/

spool off
set termout on
set trimout off
set trimspool off

 

7.undohistoryinfo.sql

 

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-hh24:mi’;

spool undohistoryinfo.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## HISTORICAL DATA ##############
prompt

col x format 999,999 head “Max Concurrent|Last 7 Days”
col y format 999,999 head “Max Concurrent|Since Startup”

select max(maxconcurrency) x from v$undostat
/
select max(maxconcurrency) y from sys.wrh$_undostat
/

col i format 999,999 head “1555 Errors”
col j format 999,999 head “Undo Space Errors”

select sum(ssolderrcnt) i from v$undostat
where end_time > sysdate-2
/

select sum(nospaceerrcnt) j from v$undostat
where end_time > sysdate-2
/
clear break
clear compute

prompt
prompt ############## CURRENT STATUS OF SEGMENTS ##############
prompt ############## SNAPSHOT IN TIME INFO ##############
prompt ##############(SHOWS CURRENT UNDO ACTIVITY)##############
prompt

col segment_name format a30 head “Segment Name”
col “ACT BYTES” format 999,999,999,999 head “Active Bytes”
col “UNEXP BYTES” format 999,999,999,999 head “Unexpired Bytes”
col “EXP BYTES” format 999,999,999,999 head “Expired Bytes”

select segment_name, nvl(sum(act),0) “ACT BYTES”,
nvl(sum(unexp),0) “UNEXP BYTES”,
nvl(sum(exp),0) “EXP BYTES”
from (select segment_name, nvl(sum(bytes),0) act,00 unexp, 00 exp
from dba_undo_extents where status=’ACTIVE’ group by segment_name
union
select segment_name, 00 act, nvl(sum(bytes),0) unexp, 00 exp
from dba_undo_extents where status=’UNEXPIRED’ group by segment_name
union
select segment_name, 00 act, 00 unexp, nvl(sum(bytes),0) exp
from dba_undo_extents where status=’EXPIRED’ group by segment_name)
group by segment_name
order by 1
/

prompt
prompt ############## UNDO SPACE USAGE ##############
prompt

col usn format 999,999 head “Segment#”
col shrinks format 999,999,999 head “Shrinks”
col aveshrink format 999,999,999 head “Avg Shrink Size”

select usn, shrinks, aveshrink from v$rollstat
/
spool off
set termout on
set trimout off
set trimspool off
clear col

8.undoparameters.sql

 

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-Mon-yyyy hh24:mi’;

spool undoparameters.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

col inst_id format 999 head “Instance #”
col Parameter format a35 wrap
col “Session Value” format a25 wrapped
col “Instance Value” format a25 wrapped

prompt
prompt ############## PARAMETERS ##############
prompt

select a.inst_id, a.ksppinm “Parameter”,
b.ksppstvl “Session Value”,
c.ksppstvl “Instance Value”
from x$ksppi a, x$ksppcv b, x$ksppsv c
where a.indx = b.indx and a.indx = c.indx
and a.inst_id=b.inst_id and b.inst_id=c.inst_id
and a.ksppinm in (‘_undo_autotune’, ‘_smu_debug_mode’,
‘_highthreshold_undoretention’,
‘undo_tablespace’,’undo_retention’,’undo_management’)
order by 2;

spool off
set termout on
set trimout off
set trimspool off
clear col

 

9. undopressure.sql

 

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-hh24:mi’;

spool undopressure.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## WAITS FOR UNDO (Since Startup) ##############
prompt

col inst_id head “Instance#”
col eq_type format a3 head “Enq”
col total_req# format 999,999,999,999,999,999 head “Total Requests”
col total_wait# format 999,999 head “Total Waits”
col succ_req# format 999,999,999,999,999,999 head “Successes”
col failed_req# format 999,999,999999 head “Failures”
col cum_wait_time format 999,999,999 head “Cummalitve|Time”

select * from v$enqueue_stat where eq_type=’US’
union
select * from v$enqueue_stat where eq_type=’HW’
/

prompt
prompt ############## LOCKS FOR UNDO ##############
prompt

col addr head “ADDR”
col KADDR head “KADDR”
col sid head “Session”
col osuser format a10 head “OS User”
col machine format a15 head “Machine”
col program format a17 head “Program”
col process format a7 head “Process”
col lmode head “Lmode”
col request head “Request”
col ctime format 9,999 head “Time|(Mins)”
col block head “Blocking?”

select /*+ RULE */ a.SID, b.process,
b.OSUSER, b.MACHINE, b.PROGRAM,
addr, kaddr, lmode, request, round(ctime/60/60,0) ctime, block
from
v$lock a,
v$session b
where
a.sid=b.sid
and a.type=’US’
/

prompt
prompt ############## TUNED RETENTION HISTORY (Last 2 Days) ##############
prompt ############## LOWEST AND HIGHEST DATA ##############
prompt

col low format 999,999,999,999 head “Undo Retention|Lowest Tuned Value”
col high format 999,999,999,999 head “Undo Retention|Highest Tuned Value”

select end_time, tuned_undoretention from v$undostat where tuned_undoretention = (
select min(tuned_undoretention) low
from v$undostat
where end_time > sysdate-2)
/

select end_time, tuned_undoretention from v$undostat where tuned_undoretention = (
select max(tuned_undoretention) high
from v$undostat
where end_time > sysdate-2)
/

prompt
prompt ############## CURRENT TRANSACTIONS ##############
prompt

col sql_text format a40 word_wrapped head “SQL Code”

select a.start_date, a.start_scn, a.status, c.sql_text
from v$transaction a, v$session b, v$sqlarea c
where b.saddr=a.ses_addr and c.address=b.sql_address
and b.sql_hash_value=c.hash_value
/

select current_scn from v$database
/

col a format 999,999 head “UnexStolen”
col b format 999,999 head “ExStolen”
col c format 999,999 head “UnexReuse”
col d format 999,999 head “ExReuse”

prompt
prompt ############## WHO’S STEALING WHAT? (Last 2 Days) ##############
prompt

select unxpstealcnt a, expstealcnt b,
unxpblkreucnt c, expblkreucnt d
from v$undostat
where (unxpstealcnt > 0 or expstealcnt > 0)
and end_time > sysdate-2
/

spool off
set termout on
set trimout off
set trimspool off
clear col

 

10. undostatistics.sql

 

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-hh24:mi’;

spool undostatistics.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

col current_scn head “SCN Now”
col start_date head “Trans Started”
col start_scn head “SCN for Trans”
col ses_addr head “ADDR”

prompt
prompt ############## Historical V$UNDOSTAT (Last 2 Days) ##############
prompt

col end_time format a18 Head “Date/Time”
col maxq format 999,999 head “Query|Maximum|Minutes”
col maxquerysqlid head “SqlID”
col undotsn format 999,999 head “TBS”
col undoblks format 999,999,999 head “Undo|Blocks”
col txncount format 999,999,999 head “# of|Trans”
col unexpiredblks format 999,999,999 head “# of Unexpired”
col expiredblks format 999,999,999 head “# of Expired”
col tuned format 999,999 head “Tuned Retention|(Minutes)”

select end_time, round(maxquerylen/60,0) maxq, maxquerysqlid,
undotsn, undoblks, txncount, unexpiredblks, expiredblks,
round(tuned_undoretention/60,0) Tuned
from dba_hist_undostat
where end_time > sysdate-2
order by 1
/

prompt
prompt ############## RECENT MISSES FOR UNDO (Last 2 Days) ##############
prompt

clear col
set lines 500
select * from v$undostat where maxquerylen > tuned_undoretention
and end_time > sysdate-2
order by 2
/

select * from sys.wrh$_undostat where maxquerylen > tuned_undoretention
and end_time > sysdate-2
order by 2
/

prompt
prompt ############## AUTO-TUNING TUNE-DOWN DATA ##############
prompt ############## ROLLBACK DATA (Since Startup) ##############
prompt

col name format a60 head “Name”
col value format 999,999,999 head “Counters”

select name, value from v$sysstat
where name like ‘%down retention%’ or name like ‘une down%’
or name like ‘%undo segment%’ or name like ‘%rollback%’
or name like ‘%undo record%’
/

prompt
prompt ############## Long Running Query History ##############
prompt

col end_time head “Date”
col maxquerysqlid head “SQL ID”
col runawayquerysqlid format a15 head “Runaway SQL ID”
col results format a35 word_wrapped head “Space Issues”
col status head “Status”
col newret head “Tuned Down|Retention”

select end_time, maxquerysqlid, runawayquerysqlid, status,
decode(status,1,’Slot Active’,4,’Reached Best Retention’,5,’Reached Best Retention’,
8, ‘Runaway Query’,9,’Runaway Query-Active’,10,’Space Pressure’,
11,’Space Pressure Currently’,
16, ‘Tuned Down (to undo_retention) due to Space Pressure’,
17,’Tuned Down (to undo_retention) due to Space Pressure-Active’,
18, ‘Tuning Down due to Runaway’, 19, ‘Tuning Down due to Runaway-Active’,
28, ‘Runaway tuned down to last tune down value’,
29, ‘Runaway tuned down to last tune down value’,
32, ‘Max Tuned Down – Not Auto-Tuning’,
33, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
37, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
38, ‘Max Tuned Down – Not Auto-Tuning’,
39, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
40, ‘Max Tuned Down – Not Auto-Tuning’,
41, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
42, ‘Max Tuned Down – Not Auto-Tuning’,
44, ‘Max Tuned Down – Not Auto-Tuning’,
45, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
‘Other (‘||status||’)’) Results, spcprs_retention NewRet
from sys.wrh$_undostat
where status > 1
/

prompt
prompt ############## Details on Long Run Queries ##############
prompt

col sql_fulltext head “SQL Text”
Col sql_id heading “SQL ID”

select sql_id, sql_fulltext, last_load_time “Last Load”,
round(elapsed_time/1000000/60/60/24,0) “Elapsed Days”
from v$sql where sql_id in
(select maxquerysqlid from sys.wrh$_undostat
where status > 1)
/

spool off
set termout on
set trimout off
set trimspool off
clear col

 

11.

 

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-Mon-yyyy hh24:mi’;

spool undousage.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## IN USE Undo Data ##############
prompt

select
((select (nvl(sum(bytes),0))
from dba_undo_extents
where tablespace_name in (select tablespace_name from dba_tablespaces
where retention like ‘%GUARANTEE’ )
and status in (‘ACTIVE’,’UNEXPIRED’)) *100) /
(select sum(bytes)
from dba_data_files
where tablespace_name in (select tablespace_name from dba_tablespaces
where retention like ‘%GUARANTEE’ )) “PCT_INUSE”
from dual;

select tablespace_name, extent_management, allocation_type,
segment_space_management, retention
from dba_tablespaces where retention like ‘%GUARANTEE’
/

col c format 999,999,999,999 head “Sum of Free”

select (nvl(sum(bytes),0)) c from dba_free_space
where tablespace_name in
(select tablespace_name from dba_tablespaces where retention like ‘%GUARANTEE’)
/

col d format 999,999,999,999 head “Total Bytes”

select sum(bytes) d from dba_data_files
where tablespace_name in
(select tablespace_name from dba_tablespaces where retention like ‘%GUARANTEE’)
/

PROMPT
PROMPT ############## UNDO SEGMENTS ##############
PROMPT

col status head “Status”
col z format 999,999 head “Total Extents”
break on report
compute sum on report of z

select status, count(*) z from dba_undo_extents
group by status
/

col z format 999,999 head “Undo Segments”

select status, count(*) z from dba_rollback_segs
group by status
/

clear break
clear compute

prompt
prompt ############## CURRENT STATUS OF SEGMENTS ##############
prompt ############## SNAPSHOT IN TIME INFO ##############
prompt ##############(SHOWS CURRENT UNDO ACTIVITY)##############
prompt

col segment_name format a30 head “Segment Name”
col “ACT BYTES” format 999,999,999,999 head “Active Bytes”
col “UNEXP BYTES” format 999,999,999,999 head “Unexpired Bytes”
col “EXP BYTES” format 999,999,999,999 head “Expired Bytes”

select segment_name, nvl(sum(act),0) “ACT BYTES”,
nvl(sum(unexp),0) “UNEXP BYTES”,
nvl(sum(exp),0) “EXP BYTES”
from (select segment_name, nvl(sum(bytes),0) act,00 unexp, 00 exp
from dba_undo_extents where status=’ACTIVE’ group by segment_name
union
select segment_name, 00 act, nvl(sum(bytes),0) unexp, 00 exp
from dba_undo_extents where status=’UNEXPIRED’ group by segment_name
union
select segment_name, 00 act, 00 unexp, nvl(sum(bytes),0) exp
from dba_undo_extents where status=’EXPIRED’ group by segment_name)
group by segment_name
order by 1
/

prompt
prompt ############## UNDO SPACE USAGE ##############
prompt

col usn format 999,999 head “Segment#”
col shrinks format 999,999,999 head “Shrinks”
col aveshrink format 999,999,999 head “Avg Shrink Size”

select usn, shrinks, aveshrink from v$rollstat
/
spool off
set termout on
set trimout off
set trimspool off
clear col

自动undo管理模式下的undo错误分析常用脚本

归档日志间隙(Archive Gap)

 

归档日志间隙是在 Standby 端一系列丢失的重做日志,导致日志应用服务无法继续运行。这通常发生在 Standby 端无法从 Primary Database 接收重做日志或重做日志在 Standby Database 上不可用时。常见原因有:

  • 网络连接断开或者日志传输服务停止
  • Standby Database 不可用
  • 日志传输服务的配置错误
  • Standby 端的 IO 问题
  • 归档日志在应用到 Standby 前被手工删除
  • Primary 和 Standby 之间的网络带宽不足

一旦在 Standby Database 上存在归档间隙,Log Apply Services 就会卡住,直到日志间隙(Gap)被解决,例如。丢失的 Redo 被重新获取并且在 Standby 端可用。然后,日志应用服务可以选中它并继续处理。

 

解决日志间隙的方式

 

有4种方案来解决 Standby Database 上的日志间隙。这些方案在下面讨论。

自动日志间隙解决方案

自动日志间隙解决方案是由日志传输服务自动进行的。它会把当前正在传输的日志和最近收到的日志进行对比,如果有不匹配的情况出现,Standby 端的 RFS 进程就会检测到并自动发送 ARCH-RFS 心跳 Ping 请求来要求发送丢失的日志。这种类型的日志间隙解决方法使用了主数据库上的 log_archive_dest_n 中定义的 Service。另外 ARCH-RFS 心跳 Ping 可以通过对当前的日子序列号进行查询来检测日志间隙。如果存在日志间隙则仍通过ARCH-RFS 心跳 Ping 请求来解决它。在问题得到解决后,会通知日志传输进程(ARCH 或者 LGWR)。对于自动日志间隙解决方案,不需要额外的设置或者监控。

 

FAL (Fetch Archive Log)日志间隙解决方案

当一个归档日志在 Standby 数据库上被收到或者归档,它就会被注册到 Standby 的控制文件中。(您可以在物理 Standby 数据库上查询 v$archived_log 或在逻辑 Standby 数据库上查询 dba_logstdby_log 来获取这些注册信息)。如果这个文件因为某些原因丢失或者损坏(比如,它被意外删除),FAL 就会被调用来解决日志间隙问题。因为这些缺失的日志文件通常由 Standby 数据库上的日志应用服务检测到。它独立于日志传输服务,并且没有和主库之间的直接链接。要使用 FAL,必须在 Standby 数据库设置一个或者两个(11.2.0 之前的版本)初始化参数:

FAL_SERVER:设置 Oracle Net Service Name(TNS-Alias 或者 Connect Descriptor)指向用来获取丢失的归档日志的数据库。它可以是一个 Data Guard 环境的主库,或者是另一个备库,ArchiveLog Repository- 或者 Far Sync Standby (> Oracle 12.1.0) Database。可以指定多个 Service Names(逗号分隔)。FAL 会顺序的尝试这些数据库来解决日志间隙问题。

FAL_CLIENT (< Oracle 11.2.0):在 FAL_SERVER 数据库上设置 Oracle Net Service Name(TNS-Alias 或 Connect Descriptor)指向接收 REDO 的 Standby 数据库(比如,它是 FAL_SERVER 数据库需要发送 REDO 到的目标数据库)。确保这个 TNS-Alias 存在于 FAL_SERVER 数据库的 TNSNAMES.ORA 文件中。从 Oracle 11.2.0 开始,这个参数不再需要。但是需要确保 FAL_SERVER 数据库存在一个log_archive_dest_n 指向要解决日志间隙问题的 Standby 数据库。

当 Log ApplyServices 检测到日志间隙问题,它会发送一个 FAL 请求把 FAL_CLIENT 信息(Version > 11.1.0 则为 db_unique_name)给 FAL_SERVER。一个 FAL_SERVER 数据库上的 ARCH-Process 会尝试获取那个日志并发送回 FAL_CLIENT(或者 db_unique_name 对应的 Destination)。如果第一个 FAL_SERVER 无法解决日志间隙,会尝试列表中的下一个 FAL_SERVER。如果所有的 FAL_SERVERs 都无法解决,那么 FAL 请求会失败,并且一个对应的错误信息会写入对应 Standby 数据库的 ALERT.LOG。

为了成功解决日志间隙问题,需要的归档日志应当存在于 FAL_SERVER 数据库(存在于磁盘并且对应的信息同时存在于控制文件中)。

FAL 从 Oracle 9.2.0 开始可用于物理 Standby 数据库,从 Oracle 10.1.0 开始可用于逻辑 Standby 数据库。

 

手工解决日志间隙

如果日志间隙问题不能被上面提到方式解决,那么可以尝试手工解决。

可以通过查询物理 Standby 数据库的 $archive_gap 或者逻辑 Standby 数据库的 dba_logstdby_log 来确定当前的归档日志间隙,例如:

 

物理 Standby 数据库

SQL> select * from v$archive_gap;

逻辑 Standby 数据库

SQL> select thread#, sequence# from dba_logstdby_log l where next_change# not in

(select first_change# from dba_logstdby_log where l.thread#=thread#)

order by thread#, sequence#;

 

现在复制缺失的特定编号的 redo 日志到 Standby 数据库的对应位置。如缺失的日志尚未注册到 Standby 数据库,需要先注册它们才能让 Log Apply Services 处理这些日志文件。可以使用下面的命令注册:

物理 Standby:

SQL> alter database register logfile ‘<File-Specification>’;

逻辑 Standby:

SQL> alter database register logical logfile ‘<File-Specification>’;

 

在它们被注册后,Log Apply Services 就可以处理了。

 

使用增量备份前滚(仅适用于物理 Standby)

如果日志间隙无法被上面提到的方式解决,间隙太大需要太久时间才能解决或者丢失的日志无法被找到,您仍然可以通过使用 SCN 增量备份来前滚物理 Standby 数据库。这个功能从 Oracle 10.2.0 开始可以使用。这个功能通过记录最后应用到 Standby 数据库的SCN,然后使用 RMAN 以及当前控制文件的备份来对主库创建一个从那个 SCN 开始的增量备份。

之后首先用增量备份中的控制文件替换 Standby 的控制文件,之后应用增量备份到 Standby 数据库。这是一个把 Standby 数据库同步到最新的主库的状态的最快最简单的方式。因为采取的步骤各个版本都不同

Oracle dataguard 日志缺失的解决办法

概述

 

某客户由存储厂商维护人员将LUN存储划分到EMR数据库主机上;在此期间由于不当操作,导致数据库集群状态异常,之后重启了数据库主机节点2,重启主机后集群软件GRID无法正常启动,节点1仍可以正常提供数据库服务。通过远程方式接入数据库环境进行故障排查,通过对相关日志等信息的深入分析,最终找到问题点并进行处理,数据库集群恢复正常。

 

集群及数据库日志分析

1. 15:36分主机重启前的错误信息

从此处日志可以看出,在15:36分时数据库实例报IO读写错误,同时数据库集群日志中也在显示报错,之后发生了主机重启动作。

 

2. 节点2重启主机后相关日志信息

节点2重启主机后,检查集群启动状态可以发现在启动到ASM实例时停止,检查ASM ALERT日志有如下信息:

集群ALERT日志:

2021-12-12 15:48:34.081:

[client(10295)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

[client(10405)]CRS-10001:12-Dec-19 15:48 ACFS-9391: Checking for existing ADVM/ACFS installation.

[client(10410)]CRS-10001:12-Dec-19 15:48 ACFS-9392: Validating ADVM/ACFS installation files for operating system.

[client(10412)]CRS-10001:12-Dec-19 15:48 ACFS-9393: Verifying ASM Administrator setup.

[client(10415)]CRS-10001:12-Dec-19 15:48 ACFS-9308: Loading installed ADVM/ACFS drivers.

[client(10418)]CRS-10001:12-Dec-19 15:48 ACFS-9154: Loading ‘oracleoks.ko’ driver.

[client(10446)]CRS-10001:12-Dec-19 15:48 ACFS-9154: Loading ‘oracleadvm.ko’ driver.

[client(10473)]CRS-10001:12-Dec-19 15:48 ACFS-9154: Loading ‘oracleacfs.ko’ driver.

[client(10560)]CRS-10001:12-Dec-19 15:48 ACFS-9327: Verifying ADVM/ACFS devices.

[client(10564)]CRS-10001:12-Dec-19 15:48 ACFS-9156: Detecting control device ‘/dev/asm/.asm_ctl_spec’.

[client(10568)]CRS-10001:12-Dec-19 15:48 ACFS-9156: Detecting control device ‘/dev/ofsctl’.

[client(10573)]CRS-10001:12-Dec-19 15:48 ACFS-9322: completed

2021-12-12 15:48:43.930:

[client(10374)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

2021-12-12 15:48:53.698:

[client(10598)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

2021-12-12 15:49:03.129:

[client(10905)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

2021-12-12 15:50:00.511:

[client(12458)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

2021-12-12 15:51:07.276:

?

节点2:ASM ALERT日志

Thu Dec 12 16:22:11 2021

MMNL started with pid=21, OS id=26248

lmon registered with NM – instance number 2 (internal mem no 1)

Thu Dec 12 16:24:08 2021

LMON received an instance eviction notification from instance 1

The instance eviction reason is 0x20000000

The instance eviction map is 2

Thu Dec 12 16:24:11 2021

PMON (ospid: 26206): terminating the instance due to error 481

Thu Dec 12 16:24:11 2021

ORA-1092 : opitsk aborting process

Thu Dec 12 16:24:13 2021

System state dump requested by (instance=2, osid=26206 (PMON)), summary=[abnormal instance termination].

System State dumped to trace file /u01/app/grid/diag/asm/+asm/+ASM2/trace/+ASM2_diag_26216_20211212162413.trc

Dumping diagnostic data in directory=[cdmp_20211212162411], requested by (instance=2, osid=26206 (PMON)), summary=[abnormal instance termination].

Instance terminated by PMON, pid = 26206

检查对应的LMON及DIAG日志:

[grid@cxemrdb02 trace]$ more ./+ASM2_diag_26216.trc

Trace file /u01/app/grid/diag/asm/+asm/+ASM2/trace/+ASM2_diag_26216.trc

Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 – 64bit Production

With the Real Application Clusters and Automatic Storage Management options

ORACLE_HOME = /u01/app/11.2.0/grid

System name: ???Linux

Node name: ?????cxemrdb02

Release: ???????2.6.32-431.el6.x86_64

Version: ???????#1 SMP Sun Nov 10 22:19:54 EST 2013

Machine: ???????x86_64

Instance name: +ASM2

Redo thread mounted by this instance: 0 <none>

Oracle process number: 6

Unix process pid: 26216, image: oracle@cxemrdb02 (DIAG)

*** 2021-12-12 16:22:10.788

*** SESSION ID:(373.1) 2021-12-12 16:22:10.788

*** CLIENT ID:() 2021-12-12 16:22:10.788

*** SERVICE NAME:() 2021-12-12 16:22:10.788

*** MODULE NAME:() 2021-12-12 16:22:10.788

*** ACTION NAME:() 2021-12-12 16:22:10.788

Node id: 1

List of nodes: 0, 1,

*** 2021-12-12 16:22:10.790

Reconfiguration starts [incarn=0]

*** 2021-12-12 16:22:10.790

I’m the voting node

Group reconfiguration cleanup

kjzdattdlm: Can not attach to DLM (LMON up=[TRUE], DB mounted=[FALSE]).

kjzdattdlm: Can not attach to DLM (LMON up=[TRUE], DB mounted=[FALSE]).

kjzdattdlm: Can not attach to DLM (LMON up=[TRUE], DB mounted=[FALSE]).

 

3. 问题分析描述

 

通过对数据库集群相关日志的分析,可以发现在15:36分时的非正常操作,导致了数据库及集群报错并状态异常,此时进行了数据库主机的重启动作;重启后集群日志中显示ASM无法启动,根据ASM日志及TRACE文件信息,结合ORACLE官方文档中ASM on Non-First Node (Second or Others) Fails to Start: PMON (ospid: nnnn): terminating the instance due to error 481 (Doc ID 1383737.1)文档中描述的信息,可以判断是集群私网通信出现了问题,通过对集群私网相关配置进行分析、排查,最终确认是集群通信的socket文件(/var/tmp/.oracle)内容由于集群异常关闭导致不一致,从而引起集群节点2重启后不能正常启动。通过清理socket文件后集群可以正常启动 恢复正常。

 

 

 

 

 

三、总结与后续处理建议

1. 问题总结

 

通过对数据库集群相关日志的分析,可以发现在15:36分时的非正常操作,导致了数据库及集群报错并状态异常,此时进行了数据库主机的重启动作;重启后集群日志中显示ASM无法启动,根据ASM日志及TRACE文件信息,结合ORACLE官方文档中ASM on Non-First Node (Second or Others) Fails to Start: PMON (ospid: nnnn): terminating the instance due to error 481 (Doc ID 1383737.1)文档中描述的信息,可以判断是集群私网通信出现了问题;通过对集群私网相关配置进行分析、排查,最终确认是集群通信的socket文件(/var/tmp/.oracle)内容由于集群异常关闭导致不一致,从而引起集群节点2重启后与节点1通信异常,不能正常启动。通过清理socket文件后集群可以正常启动,数据库集群恢复正常。

 

 

 

4. 后续对于集群添加磁盘的建议

对于生产环境数据库集群,向ASM环境中添加磁盘是一个重要且有一点风险的动作,由于数据库所有数据都存放在ASM磁盘组中,因此如果向已有ASM磁盘组添加磁盘不当,可能导致ASM磁盘组读写异常或不能正常MOUNT,从而影响正常的数据库业务运行。因此结合ORACLE官方的标准步骤”如何在 RAC 集群或单机 ASM 环境中对已经存在的 Diskgroup 添加新磁盘(最佳实践) (文档 ID 1946668.1)”以及我们多年的数据库维护经验,

建议后续添加ASM磁盘时遵从如下步骤:

步骤 操作明细
1 对现有集群状态、ASM磁盘状态、使用率、磁盘大小检查
2 存储层面进行相应的LUN划分,建议划分的LUN大小与原ASM磁盘组中的磁盘大小一致
3 存储层新划分LUN到主机操作系统层面的映射,确认集群中每个主机可以正常识别读写LUN
4 参照已有的ASM磁盘配置相应的别名或多路径映射,集群kfed命令可以读取
5 使用新划分LUN新建临时测试磁盘组确认新划分LUN可以被ASM使用
6 删除新加的测试磁盘组
7 将新划分LUN加入所需的ASM磁盘组中
8 检查ASM磁盘组rebalance完成状态,检查集群及数据库的状态
Oracle Asm 增加磁盘 操作不当引起ASM无法启动故障处理记录

概述

医院数据库监控平台显示连接HIS数据库1节点异常,通过远程的方式接入数据库进行故障排除,快速解决了问题使得业务恢复正常使用。

问题及相关日志分析

1. 检查监听

通过2021/05/18的巡检报告中发现监听中存在1.80的IP地址,而此次检查发现监听中无1.80的IP地址。

 

2. 检查集群状态

检查集群状态,发现集群中多个资源状态显示OFFLINE。

[grid@cxhisdb02 ~]$ crsctl stat res -t

——————————————————————————–

NAME           TARGET  STATE        SERVER                   STATE_DETAILS

——————————————————————————–

Local Resources

——————————————————————————–

ora.DATA.dg

ONLINE  OFFLINE      cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.LISTENER.lsnr

ONLINE  OFFLINE      cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.OCR.dg

ONLINE  OFFLINE      cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.SSD.dg

ONLINE  OFFLINE      cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.asm

ONLINE  OFFLINE      cxhisdb01

ONLINE  ONLINE       cxhisdb02                Started

ora.gsd

OFFLINE OFFLINE      cxhisdb01

OFFLINE OFFLINE      cxhisdb02

ora.net1.network

ONLINE  OFFLINE      cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.ons

ONLINE  OFFLINE      cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.registry.acfs

ONLINE  OFFLINE      cxhisdb01

ONLINE  ONLINE       cxhisdb02

——————————————————————————–

Cluster Resources

——————————————————————————–

ora.LISTENER_SCAN1.lsnr

1        ONLINE  ONLINE       cxhisdb02

ora.cvu

1        ONLINE  OFFLINE

ora.cxhisdb01.vip

1        ONLINE  OFFLINE

ora.cxhisdb02.vip

1        ONLINE  ONLINE       cxhisdb02

ora.hospital.db

1        ONLINE  OFFLINE

2        ONLINE  ONLINE       cxhisdb02                Open

ora.oc4j

1        ONLINE  OFFLINE

ora.scan1.vip

1        ONLINE  ONLINE       cxhisdb02

 

 

3. 检查集群日志

仅有CRSD服务异常,其他集群资源均正常,因此数据库未宕机。

2021-05-26 13:23:46.059:

[crsd(145215)]CRS-1006:The OCR location +OCR is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:46.068:

[crsd(145215)]CRS-1006:The OCR location +OCR is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:56.293:

[/u01/app/11.2.0/grid/bin/oraagent.bin(66885)]CRS-5822:Agent ‘/u01/app/11.2.0/grid/bin/oraagent_grid’ disconnected from server. Details at (:CRSAGF00117:) {0:21:18} in

/u01/app/11.2.0/grid/log/cxhisdb01/agent/crsd/oraagent_grid/oraagent_grid.log.

2021-05-26 13:23:56.294:

[/u01/app/11.2.0/grid/bin/oraagent.bin(31320)]CRS-5822:Agent ‘/u01/app/11.2.0/grid/bin/oraagent_oracle’ disconnected from server. Details at (:CRSAGF00117:) {0:19:5060

3} in /u01/app/11.2.0/grid/log/cxhisdb01/agent/crsd/oraagent_oracle/oraagent_oracle.log.

2021-05-26 13:23:56.461:

[/u01/app/11.2.0/grid/bin/orarootagent.bin(145347)]CRS-5822:Agent ‘/u01/app/11.2.0/grid/bin/orarootagent_root’ disconnected from server. Details at (:CRSAGF00117:) {0:

5:1568} in /u01/app/11.2.0/grid/log/cxhisdb01/agent/crsd/orarootagent_root/orarootagent_root.log.

2021-05-26 13:23:56.485:

[/u01/app/11.2.0/grid/bin/scriptagent.bin(145549)]CRS-5822:Agent ‘/u01/app/11.2.0/grid/bin/scriptagent_grid’ disconnected from server. Details at (:CRSAGF00117:) {0:9:

68} in /u01/app/11.2.0/grid/log/cxhisdb01/agent/crsd/scriptagent_grid/scriptagent_grid.log.

2021-05-26 13:23:56.651:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:23:58.540:

[crsd(5795)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:58.548:

[crsd(5795)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:58.964:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:00.374:

[crsd(5834)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:00.382:

[crsd(5834)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:01.010:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:02.447:

[crsd(5886)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:02.455:

[crsd(5886)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:03.068:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:04.457:

[crsd(5909)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:04.465:

[crsd(5909)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:05.102:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:06.492:

[crsd(5937)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:06.501:

[crsd(5937)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:07.132:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:08.517:

[crsd(5986)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:08.525:

[crsd(5986)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:09.162:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:10.544:

[crsd(6015)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:10.552:

[crsd(6015)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:11.193:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:12.581:

[crsd(6051)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:12.589:

[crsd(6051)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:13.223:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:14.614:

[crsd(6070)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:14.622:

[crsd(6070)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:15.253:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:16.643:

[crsd(6090)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:16.650:

[crsd(6090)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:17.284:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:17.284:

[ohasd(144192)]CRS-2771:Maximum restart attempts reached for resource ‘ora.crsd’; will not restart.

2021-05-26 13:24:17.315:

[ohasd(144192)]CRS-2769:Unable to failover resource ‘ora.crsd’.

 

4. 检查ASM日志

ASM日志中2021/05/26 12:19:57显示OCR仲裁盘有问题,13:23出现读写错误。

Wed May 26 12:19:57 2021

WARNING: Waited 15 secs for write IO to PST disk 0 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 2 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 3 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 4 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 0 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 2 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 3 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 4 in group 3.

Wed May 26 12:19:57 2021

NOTE: process _b000_+asm1 (160488) initiating offline of disk 0.1409468596 (OCR_0000) with mask 0x7e in group 3

NOTE: process _b000_+asm1 (160488) initiating offline of disk 2.1409468594 (OCR_0002) with mask 0x7e in group 3

NOTE: process _b000_+asm1 (160488) initiating offline of disk 3.1409468595 (OCR_0003) with mask 0x7e in group 3

NOTE: process _b000_+asm1 (160488) initiating offline of disk 4.1409468592 (OCR_0004) with mask 0x7e in group 3

NOTE: checking PST: grp = 3

GMON checking disk modes for group 3 at 15 for pid 46, osid 160488

ERROR: no read quorum in group: required 3, found 1 disks

NOTE: checking PST for grp 3 done.

NOTE: initiating PST update: grp = 3, dsk = 0/0x5402c8b4, mask = 0x6a, op = clear

NOTE: initiating PST update: grp = 3, dsk = 2/0x5402c8b2, mask = 0x6a, op = clear

NOTE: initiating PST update: grp = 3, dsk = 3/0x5402c8b3, mask = 0x6a, op = clear

NOTE: initiating PST update: grp = 3, dsk = 4/0x5402c8b0, mask = 0x6a, op = clear

GMON updating disk modes for group 3 at 16 for pid 46, osid 160488

ERROR: no read quorum in group: required 3, found 1 disks

Wed May 26 12:19:57 2021

NOTE: cache dismounting (not clean) group 3/0xA242386E (OCR)

NOTE: messaging CKPT to quiesce pins Unix process pid: 160495, image: oracle@cxhisdb01 (B001)

Wed May 26 12:19:57 2021

NOTE: halting all I/Os to diskgroup 3 (OCR)

Wed May 26 12:19:57 2021

NOTE: LGWR doing non-clean dismount of group 3 (OCR)

NOTE: LGWR sync ABA=15.85 last written ABA 15.85

WARNING: Offline for disk OCR_0000 in mode 0x7f failed.

WARNING: Offline for disk OCR_0002 in mode 0x7f failed.

WARNING: Offline for disk OCR_0003 in mode 0x7f failed.

WARNING: Offline for disk OCR_0004 in mode 0x7f failed.

Wed May 26 12:19:58 2021

kjbdomdet send to inst 2

detach from dom 3, sending detach message to inst 2

Wed May 26 12:19:58 2021

NOTE: No asm libraries found in the system

Wed May 26 12:19:58 2021

List of instances:

1 2

Dirty detach reconfiguration started (new ddet inc 1, cluster inc 4)

Global Resource Directory partially frozen for dirty detach

* dirty detach – domain 3 invalid = TRUE

2 GCS resources traversed, 0 cancelled

Dirty Detach Reconfiguration complete

Wed May 26 12:19:58 2021

WARNING: dirty detached from domain 3

NOTE: cache dismounted group 3/0xA242386E (OCR)

 

 

2021-05-26 13:23:46.059:

[crsd(145215)]CRS-1006:The OCR location +OCR is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:46.068:

[crsd(145215)]CRS-1006:The OCR location +OCR is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

 

 

问题处理步骤

由于数据库仅1节点OCR磁盘组访问异常,2节点以及其他盘访问正常,因而导致1节点CRS资源异常,且数据库未宕机,处理步骤需启动CRS资源,并且重启监听。

1. 1节点挂载OCR磁盘组

[root@cxhisdb01 ~]# su – grid

[grid@cxhisdb01 ~]$ sqlplus / as sysasm

 

SQL*Plus: Release 11.2.0.4.0 Production on Thu May 27 12:12:17 2021

 

Copyright (c) 1982, 2013, Oracle.  All rights reserved.

 

 

Connected to:

Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 – 64bit Production

With the Real Application Clusters and Automatic Storage Management options

 

SQL> alter diskgroup ocr mount;

 

Diskgroup altered.

 

SQL> exit

2. 启动CRS

[grid@cxhisdb01 ~]$ crsctl start res ora.crsd -init

CRS-2672: Attempting to start ‘ora.crsd’ on ‘cxhisdb01’

CRS-2676: Start of ‘ora.crsd’ on ‘cxhisdb01’ succeeded

3. 检查集群资源

[grid@cxhisdb01 ~]$ crsctl stat res -t

——————————————————————————–

NAME           TARGET  STATE        SERVER                   STATE_DETAILS

——————————————————————————–

Local Resources

——————————————————————————–

ora.DATA.dg

ONLINE  ONLINE       cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.LISTENER.lsnr

ONLINE  ONLINE       cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.OCR.dg

ONLINE  ONLINE       cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.SSD.dg

ONLINE  ONLINE       cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.asm

ONLINE  ONLINE       cxhisdb01                Started

ONLINE  ONLINE       cxhisdb02                Started

ora.gsd

OFFLINE OFFLINE      cxhisdb01

OFFLINE OFFLINE      cxhisdb02

ora.net1.network

ONLINE  ONLINE       cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.ons

ONLINE  ONLINE       cxhisdb01

ONLINE  ONLINE       cxhisdb02

ora.registry.acfs

ONLINE  ONLINE       cxhisdb01

ONLINE  ONLINE       cxhisdb02

——————————————————————————–

Cluster Resources

——————————————————————————–

ora.LISTENER_SCAN1.lsnr

1        ONLINE  ONLINE       cxhisdb02

ora.cvu

1        ONLINE  ONLINE       cxhisdb01

ora.cxhisdb01.vip

1        ONLINE  ONLINE       cxhisdb01

ora.cxhisdb02.vip

1        ONLINE  ONLINE       cxhisdb02

ora.hospital.db

1        ONLINE  ONLINE       cxhisdb01                Open

2        ONLINE  ONLINE       cxhisdb02                Open

ora.oc4j

1        ONLINE  ONLINE       cxhisdb01

ora.scan1.vip

1        ONLINE  ONLINE       cxhisdb02

4. 重启监听

[grid@cxhisdb01 ~]$ srvctl stop listener -n cxhisdb01

[grid@cxhisdb01 ~]$ srvctl start listener -n cxhisdb01

[grid@cxhisdb01 ~]$ lsnrctl status

 

LSNRCTL for Linux: Version 11.2.0.4.0 – Production on 27-MAY-2021 12:20:44

 

Copyright (c) 1991, 2013, Oracle.  All rights reserved.

 

Connecting to (DESCRIPTION=(ADDRESS=(PROTOCOL=IPC)(KEY=LISTENER)))

STATUS of the LISTENER

————————

Alias                     LISTENER

Version                   TNSLSNR for Linux: Version 11.2.0.4.0 – Production

Start Date                27-MAY-2021 12:20:40

Uptime                    0 days 0 hr. 0 min. 4 sec

Trace Level               off

Security                  ON: Local OS Authentication

SNMP                      OFF

Listener Parameter File   /u01/app/11.2.0/grid/network/admin/listener.ora

Listener Log File         /u01/app/grid/diag/tnslsnr/cxhisdb01/listener/alert/log.xml

Listening Endpoints Summary…

(DESCRIPTION=(ADDRESS=(PROTOCOL=ipc)(KEY=LISTENER)))

(DESCRIPTION=(ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.80)(PORT=1521)))

(DESCRIPTION=(ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.10)(PORT=1521)))

Services Summary…

Service “hospital” has 1 instance(s).

Instance “hospital1”, status READY, has 1 handler(s) for this service…

Service “hospitalXDB” has 1 instance(s).

Instance “hospital1”, status READY, has 1 handler(s) for this service…

The command completed successfully

[grid@cxhisdb01 ~]$ cd /u01/app/grid/diag/tnslsnr/cxhisdb01/listener/trace/

[grid@cxhisdb01 trace]$ tail -f listener.log

27-MAY-2021 12:20:51 * (CONNECT_DATA=(CID=(PROGRAM=)(HOST=__jdbc__)(USER=))(SERVICE_NAME=hospital)(CID=(PROGRAM=)(HOST=__jdbc__)(USER=))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.171)(PORT=62045)) * establish * hospital * 0

27-MAY-2021 12:20:51 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=ZY-603300-02-YS)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.9.58)(PORT=3676)) * establish * hospital * 0

27-MAY-2021 12:20:51 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=ZY-603300-02-YS)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.9.58)(PORT=3677)) * establish * hospital * 0

27-MAY-2021 12:20:55 * (CONNECT_DATA=(SERVICE_NAME=HOSPITAL)(CID=(PROGRAM=oracle)(HOST=lis-server)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.11)(PORT=23431)) * establish * HOSPITAL * 0

27-MAY-2021 12:20:55 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=JZ-EK-001)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.3.99)(PORT=4442)) * establish * hospital * 0

27-MAY-2021 12:20:55 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=JZ-EK-001)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.3.99)(PORT=4444)) * establish * hospital * 0

27-MAY-2021 12:20:56 * (CONNECT_DATA=(SID=hospital1)(CID=(PROGRAM=配置数据库.exe)(HOST=YYJQZZJFW)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.244)(PORT=58397)) * establish * hospital1 * 0

27-MAY-2021 12:20:56 * (CONNECT_DATA=(SERVICE_NAME=HOSPITAL)(CID=(PROGRAM=oracle)(HOST=lis-server)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.11)(PORT=23432)) * establish * HOSPITAL * 0

27-MAY-2021 12:20:56 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=ZY-603300-02-YS)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.9.58)(PORT=3681)) * establish * hospital * 0

27-MAY-2021 12:20:57 * service_update * hospital1 * 0

27-MAY-2021 12:20:58 * (CONNECT_DATA=(SERVICE_NAME=HOSPITAL)(CID=(PROGRAM=oracle)(HOST=lis-server)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.11)(PORT=23438)) * establish * HOSPITAL * 0

27-MAY-2021 12:20:59 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=JZ-EK-001)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.3.99)(PORT=4460)) * establish * hospital * 0

27-MAY-2021 12:21:00 * service_update * hospital1 * 0

Thu May 27 12:21:01 2021

27-MAY-2021 12:21:01 * (CONNECT_DATA=(SID=hospital1)(CID=(PROGRAM=配置数据库.exe)(HOST=YYJQZZJFW)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.244)(PORT=58398)) * establish * hospital1 * 0

27-MAY-2021 12:21:03 * service_update * hospital1 * 0

27-MAY-2021 12:21:03 * (CONNECT_DATA=(SERVICE_NAME=HOSPITAL)(CID=(PROGRAM=oracle)(HOST=lis-server)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.11)(PORT=23444)) * establish * HOSPITAL * 0

 

总结与后续处理建议

1. 问题总结

HIS数据库1节点访问OCR仲裁盘异常导致CRSD异常,进而引起1节点上多个集群资源offline,监听资源中相较于以前也少了192.2.1.80的IP,因此监控平台显示连接HIS数据库1节点异常,由于2节点和其他磁盘组无异常,因此数据库未发生宕机现象。

2. 处理操作

  • HIS数据库1节点手工mount OCR磁盘组
  • 启动CRS资源
  • 1节点重启监听资源

3. 后续建议

此次故障发生是由于集群资源访问OCR磁盘组异常导致,建议联系存储工程师,排查相关时间点存储运行情况,并定期做好存储巡检和状态监控工作。

Oracle voting disk 故障处理一例

概述

某医院客户于2021年11月5日上午9时左右出现HIS数据库连接异常的情况,在此情况下通过远程的方式接入数据库进行故障排除,快速解决了问题使得业务恢复正常使用。

问题及相关日志分析

1. 问题现状

前台业务反映HIS数据库时而连得上时而连不上,连不上时候报错ORA-12516,报错内容截图如下:

 

由于业务通过节点1的vip(192.2.1.10)连接数据库,测试节点1的vip连接,结果如下:

[oracle@cxhisdb01 ~]$ sqlplus a/b@192.2.1.10:1521/hospital

 

SQL*Plus: Release 11.2.0.4.0 Production on Fri Nov 5 09:45:15 2021

 

Copyright (c) 1982, 2013, Oracle.  All rights reserved.

 

ERROR:

ORA-12516: TNS:listener could not find available handler with matching protocol

stack

 

2. 检查数据库alert日志

ORA-00020: maximum number of processes (4500) exceeded

ORA-20 errors will not be written to the alert log for

the next minute. Please look at trace files to see all

the ORA-20 errors.

Fri Nov 05 09:35:48 2021

Thread 1 advanced to log sequence 191580 (LGWR switch)

Current log# 1 seq# 191580 mem# 0: +SSD/hospital/onlinelog/redo01.log

Fri Nov 05 09:35:48 2021

LNS: Standby redo logfile selected for thread 1 sequence 191580 for destination LOG_ARCHIVE_DEST_4

Fri Nov 05 09:35:48 2021

LNS: Standby redo logfile selected for thread 1 sequence 191580 for destination LOG_ARCHIVE_DEST_3

Fri Nov 05 09:35:48 2021

LNS: Standby redo logfile selected for thread 1 sequence 191580 for destination LOG_ARCHIVE_DEST_2

Fri Nov 05 09:35:48 2021

Archived Log entry 904316 added for thread 1 sequence 191579 ID 0x431aef8a dest 1:

Fri Nov 05 09:37:30 2021

ORA-00020: maximum number of processes (4500) exceeded

ORA-20 errors will not be written to the alert log for

the next minute. Please look at trace files to see all

the ORA-20 errors.

alert中出现大量的ORA-00020错误告警,数据库连接数达到4500上限。

3. 检查数据库参数和连接

SQL> show parameter process;

 

NAME                                 TYPE        VALUE

———————————— ———– ——————————

aq_tm_processes                      integer     1

cell_offload_processing              boolean     TRUE

db_writer_processes                  integer     10

gcs_server_processes                 integer     4

global_txn_processes                 integer     1

job_queue_processes                  integer     1000

log_archive_max_processes            integer     4

processes                            integer     4500

processor_group_name                 string

 

SQL> select inst_id,count(*) from gv$session group by inst_id;

 

INST_ID   COUNT(*)

———- ———-

1       4496

2        715

查询得知,1节点连接数达到4496,已经达到连接数4500的上限,因此前端业务出现连接不上的情况。

4. 定位异常连接

SQL> select program,machine,count(*)

2  from gv$session group by program,machine,sql_id order by 3;

 

 

PROGRAM                                          MACHINE       COUNT(*)

——————————– ————————–  ———-

w3wp.exe                         WORKGROUP\SXZZ_SERVER_CX            10

Medinfo.Quartz.DbJob.exe         WORKGROUP\QUANGUO-NB                10

yszj.exe                         WORKGROUP\MZ-3F-EBHYS7              10

yszj.exe                         WORKGROUP\MZ-2F-NK2YS4              10

yszj.exe                         WORKGROUP\MZ1F-EKYS7                10

emrproject.exe                   WORKGROUP\ZY1-2X-YS2                10

 

PROGRAM                          MACHINE                       COUNT(*)

——————————– ————————–  ———-

yszj.exe                         WORKGROUP\MZ-2F-WKYS9               11

yszj.exe                         WORKGROUP\MZ-2F-FKYS13              11

emrjz.exe                        WORKGROUP\JZ-QJS-HS1                11

yzxt.exe                         WORKGROUP\ZY-604200-01-HS           11

yszj.exe                         WORKGROUP\MZ-2F-GKYS3               12

yszj.exe                         WORKGROUP\MZ-3F-KQYS6               12

yzxt.exe                         WORKGROUP\ZY-609400-02-HS           12

HisServer.exe                    WORKGROUP\YYGHSERV                  12

yszj.exe                         WORKGROUP\GRK-GBMZ-01               12

oracle@lis-server (TNS V1-V3)    lis-server                          12

yszj.exe                         WORKGROUP\MZ-2F-WKYS10              13

 

PROGRAM                          MACHINE                       COUNT(*)

——————————– ————————–  ———-

yszj.exe                         WORKGROUP\MZ-1F-EKYS6               13

yszj.exe                         WORKGROUP\MZ-NKJ-01                 13

yzxt.exe                         WORKGROUP\ZY-ZHNK1-HS-04            13

yszj.exe                         WORKGROUP\MZ-3F-YKYS2               13

yszj.exe                         WORKGROUP\M3-YK-YS1                 14

yszj.exe                         WORKGROUP\MZ-2F-FKYS-01             14

yszj.exe                         WORKGROUP\MZ-3F-YKYS4               14

yszj.exe                         WORKGROUP\MZ-2F-FKYS10              14

yszj.exe                         WORKGROUP\MZ-3F-TJWKYS1             15

yszj.exe                         WORKGROUP\MZ-3F-WKYS5               15

MAF.exe                          WORKGROUP\QUYUSC1                   16

 

PROGRAM                          MACHINE                       COUNT(*)

——————————– ————————–  ———-

yszj.exe                         WORKGROUP\MZ-3F-WKYS3               16

yzxt.exe                         WORKGROUP\HIS-CXYY                  16

yszj.exe                         WORKGROUP\MZ-2F-NK2YS05             16

yszj.exe                         WORKGROUP\MZ-2F-WKYS5               17

yszj.exe                         WORKGROUP\MZ-3F-WKYS4               17

yszj.exe                         WORKGROUP\MZ-3F-KQKYS2              17

w3wp.exe                         WORKGROUP\WX_DDJY                   17

MediII.Adapter.WinForm.Scanner_V WORKGROUP\YYQZ                      18

yszj.exe                         WORKGROUP\MZ-1F-EKYS4               18

yszj.exe                         WORKGROUP\MZ-2F-NK2YS6              18

emrproject.exe                   WORKGROUP\3020-CXYY                 19

 

PROGRAM                          MACHINE                       COUNT(*)

——————————– ————————–  ———-

yszj.exe                         WORKGROUP\MZ-2F-GK24                19

yszj.exe                         WORKGROUP\ZY-609400-02-HS           19

yszj.exe                         WORKGROUP\MZ-GK-25                  20

emrproject.exe                   WORKGROUP\HIS-PC                    20

yszj.exe                         WORKGROUP\ZY-609400-02-YS           21

yszj.exe                         WORKGROUP\MZ-GUKE-YS04              21

yszj.exe                         WORKGROUP\MZ-2F-FKYS9               22

yszj.exe                         MSHOME\JZ-WKYS2                     22

yszj.exe                         WORKGROUP\MZ-2F-NK2YS-01            22

w3wp.exe                         WORKGROUP\YINYIQIANZHI1             23

yszj.exe                         WORKGROUP\MZ-2F-WKYS4               23

 

PROGRAM                          MACHINE                       COUNT(*)

——————————– ————————–  ———-

yszj.exe                         WORKGROUP\MZ-2F-GKYS2               23

yszj.exe                         WORKGROUP\MZ-2F-FKYS6               24

yszj.exe                         WORKGROUP\MZ-2F-WKYS6               25

yszj.exe                         WORKGROUP\MZ-XEK7                   25

yszj.exe                         WORKGROUP\MZ-3F-EBYHYS2             25

yszj.exe                         WORKGROUP\MZ-1F-EKYS8               26

yszj.exe                         WORKGROUP\MZ-3F-ZYYS-8              26

yszj.exe                         WORKGROUP\MZ-2F-FKYS3               27

yszj.exe                         WORKGROUP\MZ-2F-WKYS2               27

yszj.exe                         WORKGROUP\MZ-2F-WKYS3               28

yszj.exe                         WORKGROUP\MZ-2F-NK2YS2              28

 

PROGRAM                          MACHINE                       COUNT(*)

——————————– ————————–  ———-

yszj.exe                         WORKGROUP\MZ-2F-NK2-YS09            29

oracle@cxemrdb01 (TNS V1-V3)     cxemrdb01                           30

yszj.exe                         WORKGROUP\MZ-2F-NK2-9               30

yszj.exe                         WORKGROUP\MZ-3F-ZYYS2               30

yszj.exe                         WORKGROUP\MZ-3F-EBHYS6              30

yszj.exe                         WORKGROUP\MZ-2F-GKYS21              31

yszj.exe                         WORKGROUP\MZ-WAIKE-YS12             33

yzxt.exe                         WORKGROUP\ZY-7D-HS3                 33

yszj.exe                         WORKGROUP\MZ-2F-FKYS5               34

yszj.exe                         WORKGROUP\MZ-2F-NK1YS3              34

yszj.exe                         WORKGROUP\DESKTOP-VI7O6SC           36

 

PROGRAM                          MACHINE                       COUNT(*)

——————————– ————————–  ———-

yszj.exe                         WORKGROUP\MZ-2F-WKYS1               38

yszj.exe                         WORKGROUP\MZ-NK2-07                 39

yszj.exe                         WORKGROUP\MZ-2F-FKYS11              42

yszj.exe                         WORKGROUP\MZ-2F-NK1YS5              43

yszj.exe                         WORKGROUP\JZ-EK-001                 45

yszj.exe                         WORKGROUP\LHMZ_YS2                  46

yszj.exe                         WORKGROUP\MZ-2F-NK1YS2              50

w3wp.exe                         WORKGROUP\HIS-YIYAO                 50

yszj.exe                         WORKGROUP\MZ-2F-WKYS11              56

yszj.exe                         WORKGROUP\LHMZ_YS1                  56

yszj.exe                         WORKGROUP\MZ-2F-NK1YS14             58

 

PROGRAM                          MACHINE                       COUNT(*)

——————————– ————————–  ———-

yszj.exe                         WORKGROUP\MZ-2F-NK1YS13             62

yszj.exe                         WORKGROUP\JZ-NK-2                   64

yszj.exe                         WORKGROUP\MZ-2F-WKYS007             66

yszj.exe                         WORKGROUP\MZ-2F-FKYS4               66

yszj.exe                         WORKGROUP\MZ-2F-NKYS7               69

yszj.exe                         WORKGROUP\MZ-2F-NK1YS4             103

yszj.exe                         WORKGROUP\MZ-PAOZHEN-01            108

oracle@lis-server (TNS V1-V3)    lis-server                         136

oracle@cxemrdb01 (TNS V1-V3)     cxemrdb01                          395

查询得知,yszj.exe应用程序对HIS数据库建立了较高数量的连接,其中主机名为“MZ-2F”开头的客户端尤为明显。

 

 

 

 

 

 

 

 

 

 

 

 

三、问题处理步骤

1. kill session

查询yszj.exe应用程序的进程ID

SQL> select b.spid

2  from v$session a,v$process b

3  where a.paddr=b.addr

4  and a.program=’yszj.exe’;

5  and a.machine like ‘MZ-2F%’;

 

SPID

————————

70593

72017

76189

77305

78409

78481

78636

78676

79541

80183

80816

杀会话

[oracle@cxhisdb01 ~]$ kill -9 70593

[oracle@cxhisdb01 ~]$ kill -9 72017

[oracle@cxhisdb01 ~]$ kill -9 76189

[oracle@cxhisdb01 ~]$ kill -9 77305

[oracle@cxhisdb01 ~]$ kill -9 78409

[oracle@cxhisdb01 ~]$ kill -9 78481

[oracle@cxhisdb01 ~]$ kill -9 78636

[oracle@cxhisdb01 ~]$ kill -9 78676

[oracle@cxhisdb01 ~]$ kill -9 79541

[oracle@cxhisdb01 ~]$ kill -9 80183

[oracle@cxhisdb01 ~]$ kill -9 80816

执行kill session后,效果不明显,因为会话查杀后,又有新的会话连接进来。

2. 客户沟通

通过与客户沟通了解到,YZSJ.EXE程序新上了检查报告互认功能,会导致程序连接不释放问题,导致程序到数据库连接数不断增加,从而数据库连接数达到上限。

3. 释放进程

客户安排前台业务程序主机重启,释放连接进程,业务端主机重启后,检查连接情况如下:

SQL> select inst_id,count(*) from gv$session group by inst_id;

 

INST_ID   COUNT(*)

———- ———-

1       3427

2        681

 

SQL> select program,machine,count(*)

2  from gv$session  where program=’yszj.exe’ group by program,machine order by 3;

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\XZ-YGK-07                     1

yszj.exe        WORKGROUP\YJ-NKJ-MZFT                   1

yszj.exe        WORKGROUP\MZ-3F-WKYS4                   2

yszj.exe        WORKGROUP\RLS                           2

yszj.exe        WORKGROUP\MZ-3F-KQKYS2                  2

yszj.exe        WORKGROUP\MZ-2F-GKYS3                   2

yszj.exe        WORKGROUP\MZ-FWT-01                     2

yszj.exe        WORKGROUP\FDHJ54365                     2

yszj.exe        WORKGROUP\MZ-2F-WKYS2                   2

yszj.exe        WORKGROUP\MZ-3F-KQK7                    2

yszj.exe        WORKGROUP\MZ-2F-WKYS9                   2

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-2F-NK1-YS6                 2

yszj.exe        WORKGROUP\MZ-2F-FKYS9                   2

yszj.exe        WORKGROUP\MZ-2F-GKYS2                   2

yszj.exe        WORKGROUP\MZ-2F-WKYS6                   2

yszj.exe        WORKGROUP\CXYY-YZ2                      2

yszj.exe        WORKGROUP\MZ-2F-FKYS13                  2

yszj.exe        WORKGROUP\MZ-KQK-YS-1                   2

yszj.exe        WORKGROUP\MZ-2F-NK1YS13                 2

yszj.exe        WORKGROUP\ZY-XYJHZX-01-YS               2

yszj.exe        WORKGROUP\MZ-2F-WKYS8                   2

yszj.exe        WORKGROUP\MZ-2F-FKYS4                   2

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-2F-NK1YS4                  2

yszj.exe        WORKGROUP\MZ-2F-WKYS3                   2

yszj.exe        WORKGROUP\MZ-2F-NKYS7                   2

yszj.exe        WORKGROUP\MZ-3F-WKYS5                   2

yszj.exe        WORKGROUP\MZ-2F-WKYS007                 2

yszj.exe        WORKGROUP\GAMZ-01                       2

yszj.exe        WORKGROUP\MZ-WJ3                        2

yszj.exe        WORKGROUP\MZ-3F-PFYS3                   2

yszj.exe        WORKGROUP\ZY-609400-03-HS               2

yszj.exe        WORKGROUP\MZ-2F-WKYS1                   2

yszj.exe        WORKGROUP\ZY1-4X-YS5                    2

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-2F-FKYS7                   2

yszj.exe        WORKGROUP\MZ-3F-TJWKYS1                 2

yszj.exe        WORKGROUP\ZY-601700-05-YS               2

yszj.exe        WORKGROUP\MZ-PICC-02                    2

yszj.exe        WORKGROUP\JZ-EK-001                     2

yszj.exe        WORKGROUP\FSK-SSZX-02                   2

yszj.exe        WORKGROUP\ZY1-5D-YS4                    2

yszj.exe        WORKGROUP\MZ-3F-KFYJT                   2

yszj.exe        WORKGROUP\ZY-XSR-YS5                    2

yszj.exe        WORKGROUP\MZ-ZY-YS9                     2

yszj.exe        WORKGROUP\MZ-3F-EBYHYS2                 2

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-2F-FKYS15                  2

yszj.exe        WORKGROUP\MZ-2F-FK12                    2

yszj.exe        WORKGROUP\MZ-2F-NK1YS5                  2

yszj.exe        WORKGROUP\MZ-WK-SKZK                    2

yszj.exe        WORKGROUP\MZ-2F-NK1YS-17                2

yszj.exe        WORKGROUP\MZ-2F-FKYS10                  2

yszj.exe        WORKGROUP\MZ-2F-GK24                    2

yszj.exe        WORKGROUP\MZ-2F-NK2YS10                 2

yszj.exe        WORKGROUP\MZ-KOUQIANG-Y11               2

yszj.exe        WORKGROUP\JZ-EK-YS-06                   2

yszj.exe        WORKGROUP\MZ-NKJ-01                     2

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-2F-NK2-9                   2

yszj.exe        WORKGROUP\MZ-3F-ZYYS2                   2

yszj.exe        WORKGROUP\MZ-3F-EBHYS6                  2

yszj.exe        MSHOME\JZ-QJ-04                         2

yszj.exe        WORKGROUP\MZ-3F-YKYS2                   2

yszj.exe        WORKGROUP\MZ-3F-ZY-YS3-                 2

yszj.exe        WORKGROUP\MZ-2F-NK1YS2                  2

yszj.exe        WORKGROUP\MZ-KOUQIANG-Y13               2

yszj.exe        WORKGROUP\MZ-3F-ZYYS10                  2

yszj.exe        WORKGROUP\JZ-EBH-01                     2

yszj.exe        WORKGROUP\JZ-GK-YS1                     2

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-NK1-YS-017                 2

yszj.exe        WORKGROUP\MZ1F-EKYS7                    2

yszj.exe        WORKGROUP\JZ-NK-2                       2

yszj.exe        WORKGROUP\MZ-2F-FKYS11                  2

yszj.exe        WORKGROUP\MZ-2F-FKYS6                   2

yszj.exe        WORKGROUP\MZ-2F-FKYS-01                 2

yszj.exe        WORKGROUP\MZ-2F-NK1YS3                  2

yszj.exe        WORKGROUP\ZY2-11F-YS2                   2

yszj.exe        WORKGROUP\MZ-WAIKE-YS12                 2

yszj.exe        WORKGROUP\MZ-2F-NK2-YS09                2

yszj.exe        WORKGROUP\MZ-FWT-03                     2

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-2F-NK2YS6                  2

yszj.exe        WORKGROUP\MZ-1F-EKYS6                   2

yszj.exe        WORKGROUP\JZ_XIONGTONG                  2

yszj.exe        WORKGROUP\MZ-3F-KQYS6                   2

yszj.exe        WORKGROUP\MZ-3F-YK13                    2

yszj.exe        WORKGROUP\ZY-TXBQ-YYK                   2

yszj.exe        WORKGROUP\LXZL-205                      2

yszj.exe        WORKGROUP\MZ-NK-11-YS                   2

yszj.exe        WORKGROUP\MZ-3F-PFK-2                   2

yszj.exe        WORKGROUP\MZ-NK-YS8                     2

yszj.exe        WORKGROUP\MZ-2F-NK2YS2                  2

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-3F-KQKYS4                  2

yszj.exe        WORKGROUP\MZ-2F-NK2YS-01                2

yszj.exe        WORKGROUP\MZ-2F-NK1YS14                 2

yszj.exe        WORKGROUP\MZ-3F-ZYYS1                   2

yszj.exe        WORKGROUP\FRMZ-YS-03                    2

yszj.exe        WORKGROUP\MZ-NKZJ-01                    2

yszj.exe        WORKGROUP\MZ-2F-NK2YS3                  2

yszj.exe        WORKGROUP\MZ-2F-NK2YS05                 2

yszj.exe        WORKGROUP\MZ-XEK7                       2

yszj.exe        WORKGROUP\MZ-ZJNK-YS8-A                 2

yszj.exe        WORKGROUP\MZ-ZHFW-05                    2

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-2F-FKYS3                   2

yszj.exe        WORKGROUP\ZY2-6F-ZLNKYS01               2

yszj.exe        WORKGROUP\MZ-FWT-04                     2

yszj.exe        WORKGROUP\FRMZ_YS2                      2

yszj.exe        WORKGROUP\MZ-3F-WKYS3                   2

yszj.exe        WORKGROUP\JZ-QJS-YS3                    2

yszj.exe        WORKGROUP\ZY-703700-05-YS               2

yszj.exe        WORKGROUP\TSZX-02                       3

yszj.exe        WORKGROUP\MZ-3F-KQXFS1                  3

yszj.exe        WORKGROUP\DESKTOP-VI7O6SC               3

yszj.exe        WORKGROUP\ZY-600400-06-YS               3

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\FSK-DPS-08                    4

yszj.exe        WORKGROUP\MZ-3F-ZYYS5                   4

yszj.exe        MSHOME\JZ-WKYS01                        4

yszj.exe        WORKGROUP\SF-YWS-YS1                    5

yszj.exe        WORKGROUP\MZ-3F-ZYYS4                   5

yszj.exe        WORKGROUP\ZY1-1X-YS                     5

yszj.exe        WorkGroup\USER-2021SXCVV                5

yszj.exe        WORKGROUP\MZ2-NJZX-001                  5

yszj.exe        WORKGROUP\ZY-GK3-HS4                    6

yszj.exe        WORKGROUP\MZ-GB-CDMZ                    6

yszj.exe        WORKGROUP\MZ-2F-NK2YS11                 8

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-3L-ZYNK-007                9

yszj.exe        WORKGROUP\MZ-3F-ZJK1                   11

yszj.exe        WORKGROUP\MZ-3F-KQKYS5                 14

yszj.exe        WORKGROUP\HIS-PC                       15

yszj.exe        WORKGROUP\MZ-2F-WKYS10                 16

yszj.exe        WORKGROUP\GRK-GBMZ-01                  16

yszj.exe        WORKGROUP\MZ-PIFU-YS3                  18

yszj.exe        WORKGROUP\ZY-609400-02-YS              21

yszj.exe        WORKGROUP\MZ-3F-KQKYS3                 22

yszj.exe        WORKGROUP\MZ-PAOZHEN-01                23

yszj.exe        WORKGROUP\MZ-GUKE-YS04                 23

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\MZ-FR-EK-1                   24

yszj.exe        WORKGROUP\ZY-609400-02-HS              25

yszj.exe        WORKGROUP\MZ-3F-PFKYS24                25

yszj.exe        WORKGROUP\MZ-2F-WKYS5                  34

yszj.exe        WORKGROUP\MZ-3F-YKYS4                  34

yszj.exe        WORKGROUP\MZ-2F-WKYS4                  35

yszj.exe        WORKGROUP\MZ-1F-EKYS8                  35

yszj.exe        MSHOME\JZ-WKYS2                        35

yszj.exe        WORKGROUP\MZ-3F-ZYYS-8                 37

yszj.exe        WORKGROUP\MZ-1F-EKYS4                  45

yszj.exe        WORKGROUP\LHMZ_YS2                     64

 

PROGRAM         MACHINE                          COUNT(*)

————— —————————— ———-

yszj.exe        WORKGROUP\LHMZ_YS1                     78

yszj.exe        WORKGROUP\MZ-2F-WKYS11                 87

 

145 rows selected.

通过查询得知,前端业务主机重启后,释放了一部分进程,节点1连接数已经下降至3400+,前端业务连接正常,未发现ORA-12516错误。

 

 

总结与后续处理建议

1. 问题总结

YZSJ.EXE应用程序通过192.2.1.10(节点1的VIP)连接HIS数据库,由于YZSJ.EXE程序新上了检查报告互认功能,有程序连接不释放问题,导致程序到数据库连接数不断增加,数据库连接数(4500)达到上限,从而导致前端业务连接数据库失败,出现ORA-12516:TNS:监听程序找不到符合协议堆栈要求的可用处理程序。

2. 处理操作

  • 查杀会话,释放进程资源 ==>效果不明显,会话杀掉后程序会自动重连
  • 重启业务前端主机,释放资源 ==>暂时缓解,数据库连接数已经下降至3400+
  • 修复exe应用程序  ==>彻底解决,预计中午业务研发部门对yszj.exe做修复

3. 后续建议

此次故障发生的根本原因是由于YZSJ.EXE程序更新了检查报告互认功能导致的程序连接不释放导致数据库连接数达到上限,从而导致前端业务无法连接数据库,针对此次发生的故障,建议点如下:

  • 业务测试,业务更新上线前,在开发库或测试环境做充分的业务功能性测试和性能测试;
  • 调整数据库连接数参数,当前数据库的process参数为4500,可根据实际需要适当调整增大数据库的process参数,调整此参数需要重启数据库生效,因此需要安排停机时间。

 

Oracle 连接异常处理故障一例